[yt-dlp.git] / yt_dlp / extractor / nebula.py

import itertools
import json
import urllib.error

from .common import InfoExtractor
from ..utils import ExtractorError, parse_iso8601

# Scheme + host prefix shared by every _VALID_URL in this module. It accepts
# http or https, an optional "www." and any of the three domains
# watchnebula.com, nebula.app and nebula.tv.
_BASE_URL_RE = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'


class NebulaBaseIE(InfoExtractor):
    """Shared login, authenticated API access and info-dict building for the Nebula extractors."""

    _NETRC_MACHINE = 'watchnebula'

    # Both tokens are filled in by _perform_login(); None means "not obtained yet".
    _nebula_api_token = None
    _nebula_bearer_token = None

    def _perform_nebula_auth(self, username, password):
        """
        Log in with e-mail/password and return the API token (the ``key`` field
        of the login response).

        Calls ``raise_login_required`` when either credential is empty or when
        the response carries no ``key``.
        """
        if not username or not password:
            self.raise_login_required(method='password')

        data = json.dumps({'email': username, 'password': password}).encode('utf8')
        # fatal=False: a failed request yields a falsy response, which is
        # reported below as "login required" instead of a raw download error.
        response = self._download_json(
            'https://api.watchnebula.com/api/v1/auth/login/',
            data=data, fatal=False, video_id=None,
            headers={
                'content-type': 'application/json',
                # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
                'cookie': ''
            },
            note='Logging in to Nebula with supplied credentials',
            errnote='Authentication failed or rejected')
        if not response or not response.get('key'):
            self.raise_login_required(method='password')

        return response['key']

    def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''):
        """
        Fetch JSON from ``url`` with an ``Authorization`` header and return it.

        ``auth_type='api'`` sends ``Token <api token>``; ``auth_type='bearer'``
        sends ``Bearer <bearer token>``. ``method='POST'`` sends an empty body.
        On an HTTP 401/403 the login is repeated once and the call retried;
        any other ExtractorError is re-raised unchanged.
        """
        assert method in ('GET', 'POST',)
        assert auth_type in ('api', 'bearer',)

        def inner_call():
            # The header is built inside the closure so that the retry below
            # picks up the tokens refreshed by _perform_login().
            authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}'
            return self._download_json(
                url, video_id, note=note, headers={'Authorization': authorization},
                data=b'' if method == 'POST' else None)

        try:
            return inner_call()
        except ExtractorError as exc:
            # if 401 or 403, attempt credential re-auth and retry
            if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403):
                self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}')
                # NOTE(review): called without arguments, so username and
                # password are both None here and _perform_nebula_auth() goes
                # straight to raise_login_required(). Confirm whether the
                # user's stored credentials should be passed instead (and, if
                # so, guard against re-entering this handler via
                # _fetch_nebula_bearer_token()).
                self._perform_login()
                return inner_call()
            else:
                raise

    def _fetch_nebula_bearer_token(self):
        """
        Get a Bearer token for the Nebula API. This will be required to fetch video meta data.
        """
        response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/',
                                         method='POST',
                                         note='Authorizing to Nebula')
        return response['token']

    def _fetch_video_formats(self, slug):
        """Return the ``(formats, subtitles)`` pair extracted from the HLS manifest of ``slug``."""
        stream_info = self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/stream/',
                                            video_id=slug,
                                            auth_type='bearer',
                                            note='Fetching video stream info')
        manifest_url = stream_info['manifest']
        return self._extract_m3u8_formats_and_subtitles(manifest_url, slug)

    def _build_video_info(self, episode):
        """
        Build the info dict for one video from its API metadata dict.

        ``slug``, ``zype_id``, ``title``, ``channel_slug`` and ``channel_title``
        are required (a missing key raises KeyError). ``description``,
        ``published_at``, ``duration`` and the thumbnail assets are optional
        and come out as ``None`` / an empty list when absent, so that one
        missing optional field does not abort the extraction.
        """
        slug = episode['slug']
        fmts, subs = self._fetch_video_formats(slug)
        channel_slug = episode['channel_slug']
        channel_title = episode['channel_title']
        channel_url = f'https://nebula.tv/{channel_slug}'
        # Maps a size key to a dict holding the image URL under 'original'.
        thumbnail_assets = (episode.get('assets') or {}).get('thumbnail') or {}
        return {
            'id': episode['zype_id'],
            'display_id': slug,
            'formats': fmts,
            'subtitles': subs,
            # Video pages live under /videos/<slug> (see NebulaIE._VALID_URL);
            # a bare /<slug> URL would be matched as a channel instead.
            'webpage_url': f'https://nebula.tv/videos/{slug}',
            'title': episode['title'],
            'description': episode.get('description'),
            'timestamp': parse_iso8601(episode.get('published_at')),
            'thumbnails': [{
                # 'id': tn.get('name'),  # this appears to be null
                'url': tn['original'],
                'height': key,
            } for key, tn in thumbnail_assets.items() if tn and tn.get('original')],
            'duration': episode.get('duration'),
            'channel': channel_title,
            'channel_id': channel_slug,
            'channel_url': channel_url,
            'uploader': channel_title,
            'uploader_id': channel_slug,
            'uploader_url': channel_url,
            'series': channel_title,
            'creator': channel_title,
        }

    def _perform_login(self, username=None, password=None):
        """Obtain the API token from the credentials, then exchange it for a bearer token."""
        self._nebula_api_token = self._perform_nebula_auth(username, password)
        # Must come second: the bearer request authenticates with the API token.
        self._nebula_bearer_token = self._fetch_nebula_bearer_token()


class NebulaIE(NebulaBaseIE):
    """Extractor for a single Nebula video page (``/videos/<slug>``)."""

    _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)'
    _TESTS = [
        {
            'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
            'md5': '14944cfee8c7beeea106320c47560efc',
            'info_dict': {
                'id': '5c271b40b13fd613090034fd',
                'ext': 'mp4',
                'title': 'That Time Disney Remade Beauty and the Beast',
                'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
                'upload_date': '20180731',
                'timestamp': 1533009600,
                'channel': 'Lindsay Ellis',
                'channel_id': 'lindsayellis',
                'uploader': 'Lindsay Ellis',
                'uploader_id': 'lindsayellis',
                'uploader_url': 'https://nebula.tv/lindsayellis',
                'series': 'Lindsay Ellis',
                'display_id': 'that-time-disney-remade-beauty-and-the-beast',
                'channel_url': 'https://nebula.tv/lindsayellis',
                'creator': 'Lindsay Ellis',
                'duration': 2212,
                'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
            },
        },
        {
            'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
            'md5': 'd05739cf6c38c09322422f696b569c23',
            'info_dict': {
                'id': '5e7e78171aaf320001fbd6be',
                'ext': 'mp4',
                'title': 'Landing Craft - How The Allies Got Ashore',
                'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
                'upload_date': '20200327',
                'timestamp': 1585348140,
                'channel': 'Real Engineering',
                'channel_id': 'realengineering',
                'uploader': 'Real Engineering',
                'uploader_id': 'realengineering',
                'series': 'Real Engineering',
                'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
                'creator': 'Real Engineering',
                'duration': 841,
                'channel_url': 'https://nebula.tv/realengineering',
                'uploader_url': 'https://nebula.tv/realengineering',
                'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
            },
        },
        {
            'url': 'https://nebula.tv/videos/money-episode-1-the-draw',
            'md5': 'ebe28a7ad822b9ee172387d860487868',
            'info_dict': {
                'id': '5e779ebdd157bc0001d1c75a',
                'ext': 'mp4',
                'title': 'Episode 1: The Draw',
                'description': r'contains:There’s free money on offer… if the players can all work together.',
                'upload_date': '20200323',
                'timestamp': 1584980400,
                'channel': 'Tom Scott Presents: Money',
                'channel_id': 'tom-scott-presents-money',
                'uploader': 'Tom Scott Presents: Money',
                'uploader_id': 'tom-scott-presents-money',
                'uploader_url': 'https://nebula.tv/tom-scott-presents-money',
                'duration': 825,
                'channel_url': 'https://nebula.tv/tom-scott-presents-money',
                'series': 'Tom Scott Presents: Money',
                'display_id': 'money-episode-1-the-draw',
                'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
                'creator': 'Tom Scott Presents: Money',
            },
        },
        {
            'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
            'only_matching': True,
        },
    ]

    def _fetch_video_metadata(self, slug):
        """Return the metadata JSON of the video ``slug`` (bearer-authenticated call)."""
        return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/',
                                     video_id=slug,
                                     auth_type='bearer',
                                     note='Fetching video meta data')

    def _real_extract(self, url):
        """Resolve the slug from ``url``, fetch its metadata and build the info dict."""
        slug = self._match_id(url)
        video = self._fetch_video_metadata(slug)
        return self._build_video_info(video)


class NebulaSubscriptionsIE(NebulaBaseIE):
    """Playlist extractor for the ``/myshows`` page, fed by the library/video API endpoint."""

    IE_NAME = 'nebula:subscriptions'
    _VALID_URL = rf'{_BASE_URL_RE}/myshows'
    _TESTS = [{
        'url': 'https://nebula.tv/myshows',
        'playlist_mincount': 1,
        'info_dict': {
            'id': 'myshows',
        },
    }]

    def _generate_playlist_entries(self):
        """Lazily yield one info dict per video, following each page's ``next`` link."""
        page_url = 'https://content.watchnebula.com/library/video/?page_size=100'
        for page_num in itertools.count(1):
            # A falsy ``next`` link marks the last page.
            if not page_url:
                return
            page = self._call_nebula_api(
                page_url, 'myshows', auth_type='bearer',
                note=f'Retrieving subscriptions page {page_num}')
            yield from map(self._build_video_info, page['results'])
            page_url = page['next']

    def _real_extract(self, url):
        """Return the playlist with the fixed id ``myshows``; entries are produced lazily."""
        entries = self._generate_playlist_entries()
        return self.playlist_result(entries, 'myshows')


class NebulaChannelIE(NebulaBaseIE):
    """Playlist extractor for a channel page (any top-level path except ``myshows`` and ``videos/``)."""

    IE_NAME = 'nebula:channel'
    _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|videos/)(?P<id>[-\w]+)'
    _TESTS = [{
        'url': 'https://nebula.tv/tom-scott-presents-money',
        'info_dict': {
            'id': 'tom-scott-presents-money',
            'title': 'Tom Scott Presents: Money',
            'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
        },
        'playlist_count': 5,
    }, {
        'url': 'https://nebula.tv/lindsayellis',
        'info_dict': {
            'id': 'lindsayellis',
            'title': 'Lindsay Ellis',
            'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
        },
        'playlist_mincount': 2,
    }]

    def _generate_playlist_entries(self, collection_id, channel):
        """
        Lazily yield one info dict per episode of the channel.

        ``channel`` is the already-fetched first page; further pages are
        requested through the ``next`` link of each page's ``episodes`` dict.
        """
        page = channel['episodes']
        # The first page was fetched by the caller, so progress notes start at 2.
        page_num = 2
        while True:
            yield from map(self._build_video_info, page['results'])
            following_url = page['next']
            if not following_url:
                return
            response = self._call_nebula_api(
                following_url, collection_id, auth_type='bearer',
                note=f'Retrieving channel page {page_num}')
            page = response['episodes']
            page_num += 1

    def _real_extract(self, url):
        """Fetch the channel's first page and return a playlist titled from its ``details``."""
        collection_id = self._match_id(url)
        channel = self._call_nebula_api(
            f'https://content.watchnebula.com/video/channels/{collection_id}/',
            collection_id, auth_type='bearer', note='Retrieving channel')
        details = channel['details']

        return self.playlist_result(
            entries=self._generate_playlist_entries(collection_id, channel),
            playlist_id=collection_id,
            playlist_title=details['title'],
            playlist_description=details['description'],
        )
Commit	Line	Data
359df0fc	1	import itertools
bdc196a4	2	import json
ac668111	3	import urllib.error
bdc196a4	4
359df0fc	5	from .common import InfoExtractor
d50ea3ce	6	from ..utils import ExtractorError, parse_iso8601
359df0fc	7
4cca2eb1 TA	8	_BASE_URL_RE = r'https?://(?:www\.)?(?:watchnebula\.com\|nebula\.app\|nebula\.tv)'
4cca2eb1 TA	9
359df0fc HH	10
	11	class NebulaBaseIE(InfoExtractor):
	12	_NETRC_MACHINE = 'watchnebula'
	13
	14	_nebula_api_token = None
	15	_nebula_bearer_token = None
359df0fc	16
f3b3fe16 HH	17	def _perform_nebula_auth(self, username, password):
f3b3fe16 HH	18	if not username or not password:
d50ea3ce	19	self.raise_login_required(method='password')
359df0fc HH	20
	21	data = json.dumps({'email': username, 'password': password}).encode('utf8')
	22	response = self._download_json(
	23	'https://api.watchnebula.com/api/v1/auth/login/',
	24	data=data, fatal=False, video_id=None,
	25	headers={
	26	'content-type': 'application/json',
	27	# Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
	28	'cookie': ''
	29	},
	30	note='Logging in to Nebula with supplied credentials',
	31	errnote='Authentication failed or rejected')
	32	if not response or not response.get('key'):
d50ea3ce	33	self.raise_login_required(method='password')
359df0fc HH	34
	35	return response['key']
	36
359df0fc HH	37	def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''):
	38	assert method in ('GET', 'POST',)
	39	assert auth_type in ('api', 'bearer',)
bdc196a4	40
359df0fc HH	41	def inner_call():
	42	authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}'
	43	return self._download_json(
	44	url, video_id, note=note, headers={'Authorization': authorization},
	45	data=b'' if method == 'POST' else None)
	46
	47	try:
	48	return inner_call()
	49	except ExtractorError as exc:
	50	# if 401 or 403, attempt credential re-auth and retry
	51	if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403):
	52	self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}')
4c268f9c	53	self._perform_login()
359df0fc HH	54	return inner_call()
	55	else:
	56	raise
	57
	58	def _fetch_nebula_bearer_token(self):
	59	"""
	60	Get a Bearer token for the Nebula API. This will be required to fetch video meta data.
	61	"""
	62	response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/',
	63	method='POST',
	64	note='Authorizing to Nebula')
	65	return response['token']
bdc196a4	66
d50ea3ce HH	67	def _fetch_video_formats(self, slug):
	68	stream_info = self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/stream/',
	69	video_id=slug,
	70	auth_type='bearer',
	71	note='Fetching video stream info')
	72	manifest_url = stream_info['manifest']
	73	return self._extract_m3u8_formats_and_subtitles(manifest_url, slug)
359df0fc HH	74
359df0fc HH	75	def _build_video_info(self, episode):
d50ea3ce	76	fmts, subs = self._fetch_video_formats(episode['slug'])
359df0fc	77	channel_slug = episode['channel_slug']
d50ea3ce	78	channel_title = episode['channel_title']
359df0fc HH	79	return {
	80	'id': episode['zype_id'],
	81	'display_id': episode['slug'],
d50ea3ce HH	82	'formats': fmts,
	83	'subtitles': subs,
	84	'webpage_url': f'https://nebula.tv/{episode["slug"]}',
359df0fc HH	85	'title': episode['title'],
	86	'description': episode['description'],
	87	'timestamp': parse_iso8601(episode['published_at']),
	88	'thumbnails': [{
	89	# 'id': tn.get('name'), # this appears to be null
	90	'url': tn['original'],
	91	'height': key,
	92	} for key, tn in episode['assets']['thumbnail'].items()],
	93	'duration': episode['duration'],
d50ea3ce	94	'channel': channel_title,
359df0fc	95	'channel_id': channel_slug,
d50ea3ce HH	96	'channel_url': f'https://nebula.tv/{channel_slug}',
d50ea3ce HH	97	'uploader': channel_title,
359df0fc	98	'uploader_id': channel_slug,
d50ea3ce HH	99	'uploader_url': f'https://nebula.tv/{channel_slug}',
	100	'series': channel_title,
	101	'creator': channel_title,
359df0fc HH	102	}
359df0fc HH	103
52efa4b3	104	def _perform_login(self, username=None, password=None):
d50ea3ce	105	self._nebula_api_token = self._perform_nebula_auth(username, password)
359df0fc	106	self._nebula_bearer_token = self._fetch_nebula_bearer_token()
359df0fc	107
359df0fc HH	108
359df0fc HH	109	class NebulaIE(NebulaBaseIE):
4cca2eb1	110	_VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)'
bdc196a4 GS	111	_TESTS = [
bdc196a4 GS	112	{
d50ea3ce	113	'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
f3b3fe16	114	'md5': '14944cfee8c7beeea106320c47560efc',
bdc196a4 GS	115	'info_dict': {
	116	'id': '5c271b40b13fd613090034fd',
	117	'ext': 'mp4',
	118	'title': 'That Time Disney Remade Beauty and the Beast',
	119	'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
	120	'upload_date': '20180731',
	121	'timestamp': 1533009600,
	122	'channel': 'Lindsay Ellis',
359df0fc	123	'channel_id': 'lindsayellis',
bdc196a4	124	'uploader': 'Lindsay Ellis',
359df0fc	125	'uploader_id': 'lindsayellis',
f3b3fe16	126	'timestamp': 1533009600,
d50ea3ce	127	'uploader_url': 'https://nebula.tv/lindsayellis',
f3b3fe16	128	'series': 'Lindsay Ellis',
f3b3fe16	129	'display_id': 'that-time-disney-remade-beauty-and-the-beast',
d50ea3ce	130	'channel_url': 'https://nebula.tv/lindsayellis',
f3b3fe16 HH	131	'creator': 'Lindsay Ellis',
f3b3fe16 HH	132	'duration': 2212,
f3b3fe16	133	'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
bdc196a4	134	},
bdc196a4 GS	135	},
bdc196a4 GS	136	{
d50ea3ce	137	'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
f3b3fe16	138	'md5': 'd05739cf6c38c09322422f696b569c23',
bdc196a4 GS	139	'info_dict': {
	140	'id': '5e7e78171aaf320001fbd6be',
	141	'ext': 'mp4',
	142	'title': 'Landing Craft - How The Allies Got Ashore',
	143	'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
	144	'upload_date': '20200327',
	145	'timestamp': 1585348140,
359df0fc HH	146	'channel': 'Real Engineering',
	147	'channel_id': 'realengineering',
	148	'uploader': 'Real Engineering',
	149	'uploader_id': 'realengineering',
f3b3fe16	150	'series': 'Real Engineering',
f3b3fe16 HH	151	'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
	152	'creator': 'Real Engineering',
	153	'duration': 841,
d50ea3ce HH	154	'channel_url': 'https://nebula.tv/realengineering',
d50ea3ce HH	155	'uploader_url': 'https://nebula.tv/realengineering',
f3b3fe16	156	'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
bdc196a4	157	},
bdc196a4 GS	158	},
bdc196a4 GS	159	{
d50ea3ce	160	'url': 'https://nebula.tv/videos/money-episode-1-the-draw',
f3b3fe16	161	'md5': 'ebe28a7ad822b9ee172387d860487868',
bdc196a4 GS	162	'info_dict': {
	163	'id': '5e779ebdd157bc0001d1c75a',
	164	'ext': 'mp4',
	165	'title': 'Episode 1: The Draw',
	166	'description': r'contains:There’s free money on offer… if the players can all work together.',
	167	'upload_date': '20200323',
	168	'timestamp': 1584980400,
	169	'channel': 'Tom Scott Presents: Money',
359df0fc	170	'channel_id': 'tom-scott-presents-money',
bdc196a4	171	'uploader': 'Tom Scott Presents: Money',
359df0fc	172	'uploader_id': 'tom-scott-presents-money',
d50ea3ce	173	'uploader_url': 'https://nebula.tv/tom-scott-presents-money',
f3b3fe16	174	'duration': 825,
d50ea3ce	175	'channel_url': 'https://nebula.tv/tom-scott-presents-money',
f3b3fe16 HH	176	'series': 'Tom Scott Presents: Money',
	177	'display_id': 'money-episode-1-the-draw',
	178	'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
f3b3fe16	179	'creator': 'Tom Scott Presents: Money',
bdc196a4	180	},
bdc196a4 GS	181	},
	182	{
	183	'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
	184	'only_matching': True,
	185	},
	186	]
bdc196a4	187
359df0fc HH	188	def _fetch_video_metadata(self, slug):
	189	return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/',
	190	video_id=slug,
	191	auth_type='bearer',
	192	note='Fetching video meta data')
bdc196a4	193
359df0fc HH	194	def _real_extract(self, url):
	195	slug = self._match_id(url)
	196	video = self._fetch_video_metadata(slug)
	197	return self._build_video_info(video)
bdc196a4	198
bdc196a4	199
f3b3fe16 HH	200	class NebulaSubscriptionsIE(NebulaBaseIE):
f3b3fe16 HH	201	IE_NAME = 'nebula:subscriptions'
4cca2eb1	202	_VALID_URL = rf'{_BASE_URL_RE}/myshows'
f3b3fe16 HH	203	_TESTS = [
f3b3fe16 HH	204	{
d50ea3ce	205	'url': 'https://nebula.tv/myshows',
f3b3fe16 HH	206	'playlist_mincount': 1,
	207	'info_dict': {
	208	'id': 'myshows',
	209	},
	210	},
	211	]
	212
	213	def _generate_playlist_entries(self):
	214	next_url = 'https://content.watchnebula.com/library/video/?page_size=100'
	215	page_num = 1
	216	while next_url:
	217	channel = self._call_nebula_api(next_url, 'myshows', auth_type='bearer',
	218	note=f'Retrieving subscriptions page {page_num}')
	219	for episode in channel['results']:
	220	yield self._build_video_info(episode)
	221	next_url = channel['next']
	222	page_num += 1
	223
	224	def _real_extract(self, url):
	225	return self.playlist_result(self._generate_playlist_entries(), 'myshows')
	226
	227
	228	class NebulaChannelIE(NebulaBaseIE):
	229	IE_NAME = 'nebula:channel'
4cca2eb1	230	_VALID_URL = rf'{_BASE_URL_RE}/(?!myshows\|videos/)(?P<id>[-\w]+)'
359df0fc HH	231	_TESTS = [
359df0fc HH	232	{
d50ea3ce	233	'url': 'https://nebula.tv/tom-scott-presents-money',
359df0fc HH	234	'info_dict': {
	235	'id': 'tom-scott-presents-money',
	236	'title': 'Tom Scott Presents: Money',
	237	'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
bdc196a4	238	},
359df0fc	239	'playlist_count': 5,
359df0fc	240	}, {
d50ea3ce	241	'url': 'https://nebula.tv/lindsayellis',
359df0fc HH	242	'info_dict': {
	243	'id': 'lindsayellis',
	244	'title': 'Lindsay Ellis',
	245	'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
	246	},
d50ea3ce	247	'playlist_mincount': 2,
359df0fc HH	248	},
359df0fc HH	249	]
bdc196a4	250
359df0fc HH	251	def _generate_playlist_entries(self, collection_id, channel):
	252	episodes = channel['episodes']['results']
	253	for page_num in itertools.count(2):
	254	for episode in episodes:
	255	yield self._build_video_info(episode)
	256	next_url = channel['episodes']['next']
	257	if not next_url:
	258	break
	259	channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer',
	260	note=f'Retrieving channel page {page_num}')
	261	episodes = channel['episodes']['results']
bdc196a4 GS	262
bdc196a4 GS	263	def _real_extract(self, url):
359df0fc HH	264	collection_id = self._match_id(url)
	265	channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/'
	266	channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel')
	267	channel_details = channel['details']
bdc196a4	268
359df0fc HH	269	return self.playlist_result(
	270	entries=self._generate_playlist_entries(collection_id, channel),
	271	playlist_id=collection_id,
	272	playlist_title=channel_details['title'],
	273	playlist_description=channel_details['description']
	274	)