[yt-dlp.git] / youtube_dl / extractor / nexx.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    ExtractorError,
    int_or_none,
    parse_duration,
    try_get,
)


class NexxIE(InfoExtractor):
    """Extractor for single videos served through the nexx API.

    Matches ``api.nexx.cloud`` / ``api.nexxcdn.com`` ``videos/byid`` URLs and
    the ``nexx:[<domain id>:]<video id>`` shorthand.  Only the numeric video
    id captured by ``_VALID_URL`` is used to look the video up.
    """

    _VALID_URL = r'''(?x)
                        (?:
                            https?://api\.nexx(?:\.cloud|cdn\.com)/v3/\d+/videos/byid/|
                            nexx:(?:\d+:)?
                        )
                        (?P<id>\d+)
                    '''
    _TESTS = [{
        # movie
        'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907',
        'md5': '828cea195be04e66057b846288295ba1',
        'info_dict': {
            'id': '128907',
            'ext': 'mp4',
            'title': 'Stiftung Warentest',
            'alt_title': 'Wie ein Test abläuft',
            'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2',
            'release_year': 2013,
            'creator': 'SPIEGEL TV',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 2509,
            'timestamp': 1384264416,
            'upload_date': '20131112',
        },
    }, {
        # episode
        'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858',
        'info_dict': {
            'id': '247858',
            'ext': 'mp4',
            'title': 'Return of the Golden Child (OV)',
            'description': 'md5:5d969537509a92b733de21bae249dc63',
            'release_year': 2017,
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 1397,
            'timestamp': 1495033267,
            'upload_date': '20170517',
            'episode_number': 2,
            'season_number': 2,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907',
        'only_matching': True,
    }, {
        'url': 'nexx:748:128907',
        'only_matching': True,
    }, {
        'url': 'nexx:128907',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_domain_id(webpage):
        """Return the numeric id from the first nexx ``require`` script tag.

        Looks for a ``<script>`` whose ``src`` points at
        ``require.nexx.cloud`` or ``require.nexxcdn.com`` and returns the
        digits that start its path, or None when no such tag is present.
        """
        mobj = re.search(
            r'<script\b[^>]+\bsrc=["\'](?:https?:)?//require\.nexx(?:\.cloud|cdn\.com)/(?P<id>\d+)',
            webpage)
        return mobj.group('id') if mobj else None

    @staticmethod
    def _extract_urls(webpage):
        """Return a list of ``api.nexx.cloud`` video URLs embedded in webpage.

        The list is empty when the page carries no nexx ``require`` script
        or no ``_play.init(...)`` call following an ``onPLAYReady`` marker.
        """
        # Reference:
        # 1. https://nx-s.akamaized.net/files/201510/44.pdf

        entries = []

        # JavaScript Integration
        domain_id = NexxIE._extract_domain_id(webpage)
        if domain_id:
            entries.extend(
                'https://api.nexx.cloud/v3/%s/videos/byid/%s'
                % (domain_id, video_id)
                for video_id in re.findall(
                    r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)',
                    webpage))

        # TODO: support more embed formats

        return entries

    @staticmethod
    def _extract_url(webpage):
        """Return the first URL found by ``_extract_urls``, or None.

        Returning None (instead of failing with IndexError on an empty
        list) mirrors ``_extract_domain_id`` and lets callers test the
        result for truthiness.
        """
        urls = NexxIE._extract_urls(webpage)
        return urls[0] if urls else None

    def _real_extract(self, url):
        """Download the video's JSON description and build its info dict.

        Raises ExtractorError when the stream is served from a CDN type
        other than ``azure``, the only one this extractor can build URLs for.
        """
        video_id = self._match_id(url)

        video = self._download_json(
            'https://arc.nexx.cloud/api/video/%s.json' % video_id,
            video_id)['result']

        general = video['general']
        title = general['title']

        stream_data = video['streamdata']
        language = general.get('language_raw') or ''

        # TODO: reverse more cdns

        cdn = stream_data['cdnType']
        if cdn != 'azure':
            # Explicit raise rather than assert: asserts are stripped when
            # Python runs with -O, and this value comes from the remote API.
            raise ExtractorError('Unsupported CDN type: %s' % cdn)

        azure_locator = stream_data['azureLocator']

        AZURE_URL = 'http://nx%s%02d.akamaized.net/'

        def get_cdn_shield_base(shield_type='', prefix='-p'):
            """Return the base URL for the given shield type.

            Prefers a configured ``cdnShield<type>HTTP`` host, then the
            ``...HTTPS`` one; otherwise falls back to the akamaized host
            derived from the numeric suffix of ``azureAccount``.
            """
            for secure in ('', 's'):
                cdn_shield = stream_data.get(
                    'cdnShield%sHTTP%s' % (shield_type, secure.upper()))
                if cdn_shield:
                    return 'http%s://%s' % (secure, cdn_shield)
            # Reached only when neither shield key holds a value.
            return AZURE_URL % (
                prefix,
                int(stream_data['azureAccount'].replace('nexxplayplus', '')))

        azure_stream_base = get_cdn_shield_base()
        is_ml = ',' in language
        azure_manifest_base = '%s%s/%s_src%s.ism/Manifest' % (
            azure_stream_base, azure_locator, video_id,
            '_manifest' if is_ml else '')

        protection_token = try_get(
            video, lambda x: x['protectiondata']['token'], compat_str)
        manifest_query = (
            '?hdnts=%s' % protection_token if protection_token else '')

        def azure_manifest_url(manifest_format=''):
            # Plain concatenation: the base and the token are remote data
            # and must never be interpreted as a %-format template.
            return azure_manifest_base + manifest_format + manifest_query

        formats = self._extract_m3u8_formats(
            azure_manifest_url('(format=m3u8-aapl)'),
            video_id, 'mp4', 'm3u8_native',
            m3u8_id='%s-hls' % cdn, fatal=False)
        formats.extend(self._extract_mpd_formats(
            azure_manifest_url('(format=mpd-time-csf)'),
            video_id, mpd_id='%s-dash' % cdn, fatal=False))
        formats.extend(self._extract_ism_formats(
            azure_manifest_url(), video_id, ism_id='%s-mss' % cdn, fatal=False))

        # Progressive downloads: azureFileDistribution is split on ',' into
        # '<tbr>:<label>' items; a '<width>x<height>' label also provides
        # the frame dimensions.
        azure_file_distribution = stream_data.get('azureFileDistribution')
        if azure_file_distribution:
            # Resolved only here so a missing progressive host cannot fail
            # the extraction of videos without progressive files.
            azure_progressive_base = get_cdn_shield_base('Prog', '-d')
            for fd in azure_file_distribution.split(','):
                ss = fd.split(':')
                if len(ss) != 2:
                    continue
                tbr = int_or_none(ss[0])
                if not tbr:
                    continue
                f = {
                    'url': '%s%s/%s_src_%s_%d.mp4' % (
                        azure_progressive_base, azure_locator, video_id, ss[1], tbr),
                    'format_id': '%s-http-%d' % (cdn, tbr),
                    'tbr': tbr,
                }
                width_height = ss[1].split('x')
                if len(width_height) == 2:
                    f.update({
                        'width': int_or_none(width_height[0]),
                        'height': int_or_none(width_height[1]),
                    })
                formats.append(f)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'alt_title': general.get('subtitle'),
            'description': general.get('description'),
            'release_year': int_or_none(general.get('year')),
            'creator': general.get('studio') or general.get('studio_adref'),
            'thumbnail': try_get(
                video, lambda x: x['imagedata']['thumb'], compat_str),
            'duration': parse_duration(general.get('runtime')),
            'timestamp': int_or_none(general.get('uploaded')),
            'episode_number': int_or_none(try_get(
                video, lambda x: x['episodedata']['episode'])),
            'season_number': int_or_none(try_get(
                video, lambda x: x['episodedata']['season'])),
            'formats': formats,
        }


class NexxEmbedIE(InfoExtractor):
    """Extractor for ``embed.nexx.cloud`` / ``embed.nexxcdn.com`` pages.

    The embed page is downloaded and scanned for a nexx player; the actual
    extraction is then delegated to NexxIE.
    """

    _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
        'md5': '16746bfc28c42049492385c989b26c4a',
        'info_dict': {
            'id': '161464',
            'ext': 'mp4',
            'title': 'Nervenkitzel Achterbahn',
            'alt_title': 'Karussellbauer in Deutschland',
            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
            'release_year': 2005,
            'creator': 'SPIEGEL TV',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 2761,
            'timestamp': 1394021479,
            'upload_date': '20140305',
        },
        'params': {
            'format': 'bestvideo',
            'skip_download': True,
        },
    }

    @staticmethod
    def _extract_urls(webpage):
        """Return the ``src`` of every nexx embed ``<iframe>`` in webpage.

        The returned URLs are taken verbatim from the markup, so they may
        be protocol-relative (start with ``//``).
        """
        # Reference:
        # 1. https://nx-s.akamaized.net/files/201510/44.pdf

        # iFrame Embed Integration
        return [mobj.group('url') for mobj in re.finditer(
            r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1',
            webpage)]

    def _real_extract(self, url):
        """Resolve the embed page to the first NexxIE URL it contains.

        Raises ExtractorError when no nexx player can be found in the page.
        """
        embed_id = self._match_id(url)

        webpage = self._download_webpage(url, embed_id)

        # Use the list-returning helper so an empty result is reported as
        # a proper extraction error instead of an IndexError.
        nexx_urls = NexxIE._extract_urls(webpage)
        if not nexx_urls:
            raise ExtractorError(
                'Unable to find a nexx video in embed page %s' % embed_id)

        return self.url_result(nexx_urls[0], ie=NexxIE.ie_key())
Commit	Line	Data
4e826cd9 S	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
4e826cd9	4	import re
4e826cd9 S	5
	6	from .common import InfoExtractor
	7	from ..compat import compat_str
	8	from ..utils import (
4e826cd9 S	9	int_or_none,
	10	parse_duration,
	11	try_get,
4e826cd9 S	12	)
	13
	14
	15	class NexxIE(InfoExtractor):
694b6154 S	16	_VALID_URL = r'''(?x)
694b6154 S	17	(?:
9dc7ea32 S	18	https?://api\.nexx(?:\.cloud\|cdn\.com)/v3/\d+/videos/byid/\|
9dc7ea32 S	19	nexx:(?:\d+:)?
694b6154 S	20	)
	21	(?P<id>\d+)
	22	'''
4e826cd9 S	23	_TESTS = [{
	24	# movie
	25	'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907',
c0f647a1	26	'md5': '828cea195be04e66057b846288295ba1',
4e826cd9 S	27	'info_dict': {
	28	'id': '128907',
	29	'ext': 'mp4',
	30	'title': 'Stiftung Warentest',
	31	'alt_title': 'Wie ein Test abläuft',
	32	'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2',
	33	'release_year': 2013,
	34	'creator': 'SPIEGEL TV',
	35	'thumbnail': r're:^https?://.*\.jpg$',
	36	'duration': 2509,
	37	'timestamp': 1384264416,
	38	'upload_date': '20131112',
	39	},
4e826cd9 S	40	}, {
	41	# episode
	42	'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858',
	43	'info_dict': {
	44	'id': '247858',
	45	'ext': 'mp4',
	46	'title': 'Return of the Golden Child (OV)',
	47	'description': 'md5:5d969537509a92b733de21bae249dc63',
	48	'release_year': 2017,
	49	'thumbnail': r're:^https?://.*\.jpg$',
	50	'duration': 1397,
	51	'timestamp': 1495033267,
	52	'upload_date': '20170517',
	53	'episode_number': 2,
	54	'season_number': 2,
	55	},
	56	'params': {
4e826cd9 S	57	'skip_download': True,
	58	},
	59	}, {
	60	'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907',
	61	'only_matching': True,
694b6154 S	62	}, {
	63	'url': 'nexx:748:128907',
	64	'only_matching': True,
9dc7ea32 S	65	}, {
	66	'url': 'nexx:128907',
	67	'only_matching': True,
4e826cd9 S	68	}]
4e826cd9 S	69
694b6154 S	70	@staticmethod
	71	def _extract_domain_id(webpage):
	72	mobj = re.search(
	73	r'<script\b[^>]+\bsrc=["\'](?:https?:)?//require\.nexx(?:\.cloud\|cdn\.com)/(?P<id>\d+)',
	74	webpage)
	75	return mobj.group('id') if mobj else None
	76
4e826cd9 S	77	@staticmethod
	78	def _extract_urls(webpage):
	79	# Reference:
	80	# 1. https://nx-s.akamaized.net/files/201510/44.pdf
	81
	82	entries = []
	83
	84	# JavaScript Integration
694b6154 S	85	domain_id = NexxIE._extract_domain_id(webpage)
694b6154 S	86	if domain_id:
089b97cf S	87	for video_id in re.findall(
	88	r'(?is)onPLAYReady.+?_play\.init\s\(.+?\s,\s*["\']?(\d+)',
	89	webpage):
	90	entries.append(
	91	'https://api.nexx.cloud/v3/%s/videos/byid/%s'
	92	% (domain_id, video_id))
4e826cd9 S	93
	94	# TODO: support more embed formats
	95
	96	return entries
	97
3f59b015 S	98	@staticmethod
	99	def _extract_url(webpage):
	100	return NexxIE._extract_urls(webpage)[0]
	101
4e826cd9	102	def _real_extract(self, url):
e231afb1	103	video_id = self._match_id(url)
4e826cd9	104
e231afb1 RA	105	video = self._download_json(
	106	'https://arc.nexx.cloud/api/video/%s.json' % video_id,
	107	video_id)['result']
4e826cd9 S	108
	109	general = video['general']
	110	title = general['title']
	111
	112	stream_data = video['streamdata']
	113	language = general.get('language_raw') or ''
	114
c0f647a1	115	# TODO: reverse more cdns
4e826cd9 S	116
	117	cdn = stream_data['cdnType']
	118	assert cdn == 'azure'
	119
	120	azure_locator = stream_data['azureLocator']
	121
c0f647a1	122	AZURE_URL = 'http://nx%s%02d.akamaized.net/'
4e826cd9	123
b485d5d6	124	def get_cdn_shield_base(shield_type='', prefix='-p'):
c0f647a1	125	for secure in ('', 's'):
b485d5d6	126	cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper()))
c0f647a1	127	if cdn_shield:
b485d5d6	128	return 'http%s://%s' % (secure, cdn_shield)
c0f647a1	129	else:
b485d5d6	130	return AZURE_URL % (prefix, int(stream_data['azureAccount'].replace('nexxplayplus', '')))
4e826cd9	131
b485d5d6	132	azure_stream_base = get_cdn_shield_base()
4e826cd9	133	is_ml = ',' in language
c0f647a1 RA	134	azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % (
c0f647a1 RA	135	azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s'
4e826cd9 S	136
	137	protection_token = try_get(
	138	video, lambda x: x['protectiondata']['token'], compat_str)
	139	if protection_token:
c0f647a1	140	azure_manifest_url += '?hdnts=%s' % protection_token
4e826cd9 S	141
4e826cd9 S	142	formats = self._extract_m3u8_formats(
c0f647a1 RA	143	azure_manifest_url % '(format=m3u8-aapl)',
	144	video_id, 'mp4', 'm3u8_native',
	145	m3u8_id='%s-hls' % cdn, fatal=False)
	146	formats.extend(self._extract_mpd_formats(
	147	azure_manifest_url % '(format=mpd-time-csf)',
	148	video_id, mpd_id='%s-dash' % cdn, fatal=False))
	149	formats.extend(self._extract_ism_formats(
	150	azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False))
	151
b485d5d6	152	azure_progressive_base = get_cdn_shield_base('Prog', '-d')
c0f647a1 RA	153	azure_file_distribution = stream_data.get('azureFileDistribution')
	154	if azure_file_distribution:
	155	fds = azure_file_distribution.split(',')
	156	if fds:
	157	for fd in fds:
	158	ss = fd.split(':')
	159	if len(ss) == 2:
	160	tbr = int_or_none(ss[0])
	161	if tbr:
	162	f = {
	163	'url': '%s%s/%s_src_%s_%d.mp4' % (
	164	azure_progressive_base, azure_locator, video_id, ss[1], tbr),
b485d5d6	165	'format_id': '%s-http-%d' % (cdn, tbr),
c0f647a1 RA	166	'tbr': tbr,
	167	}
	168	width_height = ss[1].split('x')
	169	if len(width_height) == 2:
	170	f.update({
	171	'width': int_or_none(width_height[0]),
	172	'height': int_or_none(width_height[1]),
	173	})
	174	formats.append(f)
	175
4e826cd9 S	176	self._sort_formats(formats)
	177
	178	return {
	179	'id': video_id,
	180	'title': title,
	181	'alt_title': general.get('subtitle'),
	182	'description': general.get('description'),
	183	'release_year': int_or_none(general.get('year')),
	184	'creator': general.get('studio') or general.get('studio_adref'),
	185	'thumbnail': try_get(
	186	video, lambda x: x['imagedata']['thumb'], compat_str),
	187	'duration': parse_duration(general.get('runtime')),
	188	'timestamp': int_or_none(general.get('uploaded')),
	189	'episode_number': int_or_none(try_get(
	190	video, lambda x: x['episodedata']['episode'])),
	191	'season_number': int_or_none(try_get(
	192	video, lambda x: x['episodedata']['season'])),
	193	'formats': formats,
	194	}
3f59b015 S	195
	196
	197	class NexxEmbedIE(InfoExtractor):
	198	_VALID_URL = r'https?://embed\.nexx(?:\.cloud\|cdn\.com)/\d+/(?P<id>[^/?#&]+)'
	199	_TEST = {
	200	'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
	201	'md5': '16746bfc28c42049492385c989b26c4a',
	202	'info_dict': {
	203	'id': '161464',
	204	'ext': 'mp4',
	205	'title': 'Nervenkitzel Achterbahn',
	206	'alt_title': 'Karussellbauer in Deutschland',
	207	'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
	208	'release_year': 2005,
	209	'creator': 'SPIEGEL TV',
	210	'thumbnail': r're:^https?://.*\.jpg$',
	211	'duration': 2761,
	212	'timestamp': 1394021479,
	213	'upload_date': '20140305',
	214	},
	215	'params': {
	216	'format': 'bestvideo',
	217	'skip_download': True,
	218	},
	219	}
	220
	221	@staticmethod
	222	def _extract_urls(webpage):
	223	# Reference:
	224	# 1. https://nx-s.akamaized.net/files/201510/44.pdf
	225
	226	# iFrame Embed Integration
	227	return [mobj.group('url') for mobj in re.finditer(
13eb526f	228	r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud\|cdn\.com)/\d+/(?:(?!\1).)+)\1',
3f59b015 S	229	webpage)]
	230
	231	def _real_extract(self, url):
	232	embed_id = self._match_id(url)
	233
	234	webpage = self._download_webpage(url, embed_id)
	235
	236	return self.url_result(NexxIE._extract_url(webpage), ie=NexxIE.ie_key())