[yt-dlp.git] / yt_dlp / extractor / nebula.py

# coding: utf-8\r
from __future__ import unicode_literals\r
\r
import json\r
import time\r
\r
from urllib.error import HTTPError\r
from .common import InfoExtractor\r
from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote\r
from ..utils import (\r
    ExtractorError,\r
    parse_iso8601,\r
    try_get,\r
    urljoin,\r
)\r
\r
\r
class NebulaIE(InfoExtractor):
    """
    Extractor for Nebula video pages (nebula.app / watchnebula.com).

    Video metadata is looked up on the Zype API; playback is delegated to the
    Zype extractor through a ``url_transparent`` result pointing at the Zype
    embed player. A Nebula API token (taken from the ``nebula-auth`` cookie or
    obtained by logging in with credentials) is needed to get the Zype access
    token used in the embed URL.
    """

    _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)'
    _TESTS = [
        {
            'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast',
            'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',
            'info_dict': {
                'id': '5c271b40b13fd613090034fd',
                'ext': 'mp4',
                'title': 'That Time Disney Remade Beauty and the Beast',
                'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
                'upload_date': '20180731',
                'timestamp': 1533009600,
                'channel': 'Lindsay Ellis',
                'uploader': 'Lindsay Ellis',
            },
            'params': {
                'usenetrc': True,
            },
            'skip': 'All Nebula content requires authentication',
        },
        {
            'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
            'md5': '6d4edd14ce65720fa63aba5c583fb328',
            'info_dict': {
                'id': '5e7e78171aaf320001fbd6be',
                'ext': 'mp4',
                'title': 'Landing Craft - How The Allies Got Ashore',
                'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
                'upload_date': '20200327',
                'timestamp': 1585348140,
                'channel': 'The Logistics of D-Day',
                'uploader': 'The Logistics of D-Day',
            },
            'params': {
                'usenetrc': True,
            },
            'skip': 'All Nebula content requires authentication',
        },
        {
            'url': 'https://nebula.app/videos/money-episode-1-the-draw',
            'md5': '8c7d272910eea320f6f8e6d3084eecf5',
            'info_dict': {
                'id': '5e779ebdd157bc0001d1c75a',
                'ext': 'mp4',
                'title': 'Episode 1: The Draw',
                'description': r'contains:There’s free money on offer… if the players can all work together.',
                'upload_date': '20200323',
                'timestamp': 1584980400,
                'channel': 'Tom Scott Presents: Money',
                'uploader': 'Tom Scott Presents: Money',
            },
            'params': {
                'usenetrc': True,
            },
            'skip': 'All Nebula content requires authentication',
        },
        {
            'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
            'only_matching': True,
        },
    ]
    _NETRC_MACHINE = 'watchnebula'

    # Nebula API token; filled in by _real_initialize() and refreshed by
    # _fetch_zype_access_token() when the API answers with HTTP 401.
    _nebula_token = None

    def _retrieve_nebula_auth(self):
        """
        Log in to Nebula with the configured credentials and return a Nebula API token.

        The token is also stored in the cookie jar as the 'nebula-auth' cookie
        for nebula.app, so that _real_initialize() can pick it up again.
        Calls raise_login_required() when no credentials are configured or when
        the login response does not contain a token.
        """

        username, password = self._get_login_info()
        if not (username and password):
            self.raise_login_required()

        self.report_login()
        data = json.dumps({'email': username, 'password': password}).encode('utf8')
        response = self._download_json(
            'https://api.watchnebula.com/api/v1/auth/login/',
            data=data, fatal=False, video_id=None,
            headers={
                'content-type': 'application/json',
                # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
                'cookie': ''
            },
            note='Authenticating to Nebula with supplied credentials',
            errnote='Authentication failed or rejected')
        # With fatal=False a failed request yields a falsy response instead of raising
        if not response or not response.get('key'):
            self.raise_login_required()
        api_token = response['key']

        # save nebula token as cookie, URL-quoted compact JSON, valid for one year
        self._set_cookie(
            'nebula.app', 'nebula-auth',
            compat_urllib_parse_quote(
                json.dumps({
                    'apiToken': api_token,
                    'isLoggingIn': False,
                    'isLoggingOut': False,
                }, separators=(',', ':'))),
            expire_time=int(time.time()) + 86400 * 365,
        )

        return api_token

    def _retrieve_zype_api_key(self, page_url, display_id):
        """
        Retrieve the Zype API key.

        The key is embedded in the main JavaScript chunk referenced by the
        video page: download the page, locate the script, download it and
        read the REACT_APP_ZYPE_API_KEY value.
        """

        # Find the js that has the API key from the webpage and download it
        webpage = self._download_webpage(page_url, video_id=display_id)
        # Dots are escaped so that only a literal "main.<hex>.chunk.js" file name matches
        main_script_relpath = self._search_regex(
            r'<script[^>]*src="(?P<script_relpath>[^"]*main\.[0-9a-f]*\.chunk\.js)"[^>]*>', webpage,
            group='script_relpath', name='script relative path', fatal=True)
        main_script_abspath = urljoin(page_url, main_script_relpath)
        main_script = self._download_webpage(main_script_abspath, video_id=display_id,
                                             note='Retrieving Zype API key')

        return self._search_regex(
            r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script,
            group='api_key', name='API key', fatal=True)

    def _call_zype_api(self, path, params, video_id, api_key, note):
        """
        A helper for making calls to the Zype API.

        `params` is merged over the default query (api_key, per_page=1), so
        callers may override either default.
        """
        query = {'api_key': api_key, 'per_page': 1}
        query.update(params)
        return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note)

    def _call_nebula_api(self, path, video_id, access_token, note):
        """
        A helper for making calls to the Nebula API.

        `access_token` is sent as a 'Token ...' Authorization header.
        """
        return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={
            'Authorization': 'Token {access_token}'.format(access_token=access_token)
        }, note=note)

    def _fetch_zype_access_token(self, video_id):
        """
        Return the Zype access token of the authenticated Nebula user.

        Queries the Nebula '/auth/user/' endpoint with the current Nebula
        token. On HTTP 401 a fresh token is obtained through credential login
        and the request is retried once. Raises ExtractorError when the user
        object carries no Zype access token.
        """

        def fetch_user():
            # reads self._nebula_token at call time, so the retry uses the fresh token
            return self._call_nebula_api(
                '/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')

        try:
            user_object = fetch_user()
        except ExtractorError as exc:
            # if 401, attempt credential auth and retry; anything else is re-raised
            if not (isinstance(exc.cause, HTTPError) and exc.cause.code == 401):
                raise
            self._nebula_token = self._retrieve_nebula_auth()
            user_object = fetch_user()

        access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str)
        if not access_token:
            if try_get(user_object, lambda x: x['is_subscribed'], bool):
                # TODO: Reimplement the same Zype token polling the Nebula frontend implements
                # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
                raise ExtractorError(
                    'Unable to extract Zype access token from Nebula API authentication endpoint. '
                    'Open an arbitrary video in a browser with this account to generate a token',
                    expected=True)
            raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
        return access_token

    def _extract_channel_title(self, video_meta):
        """
        Return the channel title found in the Zype video metadata, or None.

        The title is the first entry of the 'value' list of the first
        category that has a non-empty 'value'.
        """
        # TODO: Implement the API calls giving us the channel list,
        # so that we can do the title lookup and then figure out the channel URL

        # `or []` also covers an explicit null 'categories' entry
        categories = (video_meta or {}).get('categories') or []
        for category in categories:
            values = category.get('value') if isinstance(category, dict) else None
            if values:
                return values[0]
        return None

    def _real_initialize(self):
        """
        Make sure a Nebula API token is available before extraction.

        A token stored in the 'nebula-auth' cookie is preferred; otherwise
        (or when that cookie cannot be decoded) credential login is attempted.
        """
        # check cookie jar for valid token
        nebula_cookie = self._get_cookies('https://nebula.app').get('nebula-auth')
        if nebula_cookie:
            # The cookie value is URL-quoted JSON. A malformed or unexpected
            # value must not abort extraction: fall through to credential login.
            cookie_data = self._parse_json(
                compat_urllib_parse_unquote(nebula_cookie.value), None, fatal=False)
            cookie_token = try_get(cookie_data, lambda x: x['apiToken'], compat_str)
            if cookie_token:
                self.to_screen('Authenticating to Nebula with token from cookie jar')
                self._nebula_token = cookie_token

        # try to authenticate using credentials if no valid token has been found
        if not self._nebula_token:
            self._nebula_token = self._retrieve_nebula_auth()

    def _real_extract(self, url):
        """
        Build a url_transparent result that hands playback over to the Zype extractor.

        Raises ExtractorError when the Zype API does not return exactly one
        video for the display id taken from the URL.
        """
        display_id = self._match_id(url)
        api_key = self._retrieve_zype_api_key(url, display_id)

        response = self._call_zype_api('/videos', {'friendly_title': display_id},
                                       display_id, api_key, note='Retrieving metadata from Zype')
        if len(response.get('response') or []) != 1:
            raise ExtractorError('Unable to find video on Zype API')
        video_meta = response['response'][0]

        video_id = video_meta['_id']
        zype_access_token = self._fetch_zype_access_token(display_id)

        channel_title = self._extract_channel_title(video_meta)

        # Tolerate a null 'thumbnails' entry and skip items without a URL
        thumbnails = [{
            'id': tn.get('name'),  # this appears to be null
            'url': tn['url'],
            'width': tn.get('width'),
            'height': tn.get('height'),
        } for tn in video_meta.get('thumbnails') or []
            if isinstance(tn, dict) and tn.get('url')]

        return {
            'id': video_id,
            'display_id': display_id,
            '_type': 'url_transparent',
            'ie_key': 'Zype',
            'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token),
            'title': video_meta.get('title'),
            'description': video_meta.get('description'),
            'timestamp': parse_iso8601(video_meta.get('published_at')),
            'thumbnails': thumbnails,
            'duration': video_meta.get('duration'),
            'channel': channel_title,
            'uploader': channel_title,  # we chose uploader = channel name
            # TODO: uploader_url, channel_id, channel_url
        }
Commit	Line	Data
f1823403 HH	1	# coding: utf-8\r
	2	from __future__ import unicode_literals\r
	3	\r
	4	import json\r
145bd631	5	import time\r
f1823403	6	\r
145bd631	7	from urllib.error import HTTPError\r
f1823403	8	from .common import InfoExtractor\r
145bd631	9	from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote\r
f1823403 HH	10	from ..utils import (\r
	11	ExtractorError,\r
	12	parse_iso8601,\r
	13	try_get,\r
	14	urljoin,\r
	15	)\r
	16	\r
	17	\r
	18	class NebulaIE(InfoExtractor):\r
	19	\r
1ad047d0	20	_VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)'\r
f1823403 HH	21	_TESTS = [\r
f1823403 HH	22	{\r
1ad047d0	23	'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast',\r
f1823403 HH	24	'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',\r
	25	'info_dict': {\r
	26	'id': '5c271b40b13fd613090034fd',\r
	27	'ext': 'mp4',\r
	28	'title': 'That Time Disney Remade Beauty and the Beast',\r
	29	'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',\r
	30	'upload_date': '20180731',\r
	31	'timestamp': 1533009600,\r
	32	'channel': 'Lindsay Ellis',\r
	33	'uploader': 'Lindsay Ellis',\r
	34	},\r
	35	'params': {\r
	36	'usenetrc': True,\r
	37	},\r
	38	'skip': 'All Nebula content requires authentication',\r
	39	},\r
	40	{\r
1ad047d0	41	'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',\r
f1823403 HH	42	'md5': '6d4edd14ce65720fa63aba5c583fb328',\r
	43	'info_dict': {\r
	44	'id': '5e7e78171aaf320001fbd6be',\r
	45	'ext': 'mp4',\r
	46	'title': 'Landing Craft - How The Allies Got Ashore',\r
	47	'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',\r
	48	'upload_date': '20200327',\r
	49	'timestamp': 1585348140,\r
	50	'channel': 'The Logistics of D-Day',\r
	51	'uploader': 'The Logistics of D-Day',\r
	52	},\r
	53	'params': {\r
	54	'usenetrc': True,\r
	55	},\r
	56	'skip': 'All Nebula content requires authentication',\r
	57	},\r
	58	{\r
1ad047d0	59	'url': 'https://nebula.app/videos/money-episode-1-the-draw',\r
f1823403 HH	60	'md5': '8c7d272910eea320f6f8e6d3084eecf5',\r
	61	'info_dict': {\r
	62	'id': '5e779ebdd157bc0001d1c75a',\r
	63	'ext': 'mp4',\r
	64	'title': 'Episode 1: The Draw',\r
	65	'description': r'contains:There’s free money on offer… if the players can all work together.',\r
	66	'upload_date': '20200323',\r
	67	'timestamp': 1584980400,\r
	68	'channel': 'Tom Scott Presents: Money',\r
	69	'uploader': 'Tom Scott Presents: Money',\r
	70	},\r
	71	'params': {\r
	72	'usenetrc': True,\r
	73	},\r
	74	'skip': 'All Nebula content requires authentication',\r
	75	},\r
1ad047d0	76	{\r
	77	'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',\r
	78	'only_matching': True,\r
	79	},\r
f1823403 HH	80	]\r
	81	_NETRC_MACHINE = 'watchnebula'\r
	82	\r
145bd631 HH	83	_nebula_token = None\r
	84	\r
	85	def _retrieve_nebula_auth(self):\r
f1823403 HH	86	"""\r
	87	Log in to Nebula, and returns a Nebula API token\r
	88	"""\r
	89	\r
	90	username, password = self._get_login_info()\r
	91	if not (username and password):\r
	92	self.raise_login_required()\r
	93	\r
	94	self.report_login()\r
	95	data = json.dumps({'email': username, 'password': password}).encode('utf8')\r
	96	response = self._download_json(\r
	97	'https://api.watchnebula.com/api/v1/auth/login/',\r
145bd631	98	data=data, fatal=False, video_id=None,\r
f1823403 HH	99	headers={\r
	100	'content-type': 'application/json',\r
	101	# Submitting the 'sessionid' cookie always causes a 403 on auth endpoint\r
	102	'cookie': ''\r
	103	},\r
	104	note='Authenticating to Nebula with supplied credentials',\r
	105	errnote='Authentication failed or rejected')\r
	106	if not response or not response.get('key'):\r
	107	self.raise_login_required()\r
145bd631 HH	108	\r
	109	# save nebula token as cookie\r
	110	self._set_cookie(\r
	111	'nebula.app', 'nebula-auth',\r
	112	compat_urllib_parse_quote(\r
	113	json.dumps({\r
	114	"apiToken": response["key"],\r
	115	"isLoggingIn": False,\r
	116	"isLoggingOut": False,\r
	117	}, separators=(",", ":"))),\r
	118	expire_time=int(time.time()) + 86400 * 365,\r
	119	)\r
	120	\r
f1823403 HH	121	return response['key']\r
	122	\r
	123	def _retrieve_zype_api_key(self, page_url, display_id):\r
	124	"""\r
	125	Retrieves the Zype API key\r
	126	"""\r
	127	\r
	128	# Find the js that has the API key from the webpage and download it\r
	129	webpage = self._download_webpage(page_url, video_id=display_id)\r
	130	main_script_relpath = self._search_regex(\r
	131	r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage,\r
	132	group='script_relpath', name='script relative path', fatal=True)\r
	133	main_script_abspath = urljoin(page_url, main_script_relpath)\r
	134	main_script = self._download_webpage(main_script_abspath, video_id=display_id,\r
	135	note='Retrieving Zype API key')\r
	136	\r
	137	api_key = self._search_regex(\r
	138	r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script,\r
	139	group='api_key', name='API key', fatal=True)\r
	140	\r
	141	return api_key\r
	142	\r
	143	def _call_zype_api(self, path, params, video_id, api_key, note):\r
	144	"""\r
	145	A helper for making calls to the Zype API.\r
	146	"""\r
	147	query = {'api_key': api_key, 'per_page': 1}\r
	148	query.update(params)\r
	149	return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note)\r
	150	\r
	151	def _call_nebula_api(self, path, video_id, access_token, note):\r
	152	"""\r
	153	A helper for making calls to the Nebula API.\r
	154	"""\r
	155	return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={\r
	156	'Authorization': 'Token {access_token}'.format(access_token=access_token)\r
	157	}, note=note)\r
	158	\r
145bd631 HH	159	def _fetch_zype_access_token(self, video_id):\r
	160	try:\r
	161	user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')\r
	162	except ExtractorError as exc:\r
	163	# if 401, attempt credential auth and retry\r
	164	if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401:\r
	165	self._nebula_token = self._retrieve_nebula_auth()\r
	166	user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')\r
	167	else:\r
	168	raise\r
	169	\r
f1823403 HH	170	access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str)\r
	171	if not access_token:\r
	172	if try_get(user_object, lambda x: x['is_subscribed'], bool):\r
	173	# TODO: Reimplement the same Zype token polling the Nebula frontend implements\r
	174	# see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532\r
	175	raise ExtractorError(\r
	176	'Unable to extract Zype access token from Nebula API authentication endpoint. '\r
	177	'Open an arbitrary video in a browser with this account to generate a token',\r
	178	expected=True)\r
	179	raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')\r
	180	return access_token\r
	181	\r
	182	def _extract_channel_title(self, video_meta):\r
	183	# TODO: Implement the API calls giving us the channel list,\r
	184	# so that we can do the title lookup and then figure out the channel URL\r
	185	categories = video_meta.get('categories', []) if video_meta else []\r
	186	# the channel name is the value of the first category\r
	187	for category in categories:\r
	188	if category.get('value'):\r
	189	return category['value'][0]\r
	190	\r
145bd631 HH	191	def _real_initialize(self):\r
	192	# check cookie jar for valid token\r
	193	nebula_cookies = self._get_cookies('https://nebula.app')\r
	194	nebula_cookie = nebula_cookies.get('nebula-auth')\r
	195	if nebula_cookie:\r
	196	self.to_screen('Authenticating to Nebula with token from cookie jar')\r
	197	nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value)\r
	198	self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken')\r
	199	\r
	200	# try to authenticate using credentials if no valid token has been found\r
	201	if not self._nebula_token:\r
	202	self._nebula_token = self._retrieve_nebula_auth()\r
	203	\r
f1823403 HH	204	def _real_extract(self, url):\r
f1823403 HH	205	display_id = self._match_id(url)\r
f1823403 HH	206	api_key = self._retrieve_zype_api_key(url, display_id)\r
	207	\r
	208	response = self._call_zype_api('/videos', {'friendly_title': display_id},\r
	209	display_id, api_key, note='Retrieving metadata from Zype')\r
	210	if len(response.get('response') or []) != 1:\r
	211	raise ExtractorError('Unable to find video on Zype API')\r
	212	video_meta = response['response'][0]\r
	213	\r
	214	video_id = video_meta['_id']\r
145bd631	215	zype_access_token = self._fetch_zype_access_token(display_id)\r
f1823403 HH	216	\r
	217	channel_title = self._extract_channel_title(video_meta)\r
	218	\r
	219	return {\r
	220	'id': video_id,\r
	221	'display_id': display_id,\r
	222	'_type': 'url_transparent',\r
	223	'ie_key': 'Zype',\r
	224	'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token),\r
	225	'title': video_meta.get('title'),\r
	226	'description': video_meta.get('description'),\r
	227	'timestamp': parse_iso8601(video_meta.get('published_at')),\r
145bd631 HH	228	'thumbnails': [{\r
	229	'id': tn.get('name'), # this appears to be null\r
	230	'url': tn['url'],\r
	231	'width': tn.get('width'),\r
	232	'height': tn.get('height'),\r
	233	} for tn in video_meta.get('thumbnails', [])],\r
f1823403 HH	234	'duration': video_meta.get('duration'),\r
	235	'channel': channel_title,\r
	236	'uploader': channel_title, # we chose uploader = channel name\r
	237	# TODO: uploader_url, channel_id, channel_url\r
	238	}\r