[yt-dlp.git] / yt_dlp / extractor / duboku.py

import base64
import re
import urllib.parse

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    ExtractorError,
    clean_html,
    extract_attributes,
    get_elements_by_class,
    int_or_none,
    js_to_json,
    smuggle_url,
    unescapeHTML,
)


def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    if tag is None:
        tag = '[a-zA-Z0-9:._-]+'
    if attribute is None:
        attribute = ''
    else:
        attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
    if value is None:
        value = ''
    else:
        value = re.escape(value) if escape_value else value
        value = '=[\'"]?(?P<value>%s)[\'"]?' % value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <(?P<tag>%s)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         %s%s
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (tag, attribute, value), html):
        retlist.append(m)

    return retlist


def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the first match found by `_get_elements_by_tag_and_attrib`, or None.

    All arguments are forwarded unchanged to that function.
    """
    matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    if not matches:
        return None
    return matches[0]


class DubokuIE(InfoExtractor):
    # Extractor for a single episode page: /vodplay/<series>-<season>-<episode>.html
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.io'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'mp4',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
            'episode': 'Episode 1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'mp4',
            'series': '亲爱的自己',
            'title': 'contains:第1集',
            'season_number': 1,
            'episode_number': 1,
            'episode': 'Episode 1',
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

    # Captures the object literal assigned to `player_data` in an inline script.
    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'

    @staticmethod
    def _extract_titles(webpage_html, series_id):
        """Return ``(series_title, title)`` scraped from the episode page.

        Scans each chunk returned by ``get_elements_by_class('title', ...)``
        for an ``<a>`` whose href ends in ``/<series_id>.html``.  The cleaned
        link text becomes the series title and the cleaned whole chunk the
        episode title.  Returns ``(None, None)`` when nothing matches.
        """
        for html in get_elements_by_class('title', webpage_html):
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if not mobj:
                continue
            href = extract_attributes(mobj.group(0)).get('href')
            if not href:
                continue
            href_id = re.search(r'/(\d+)\.html', href)
            if href_id and href_id.group(1) == series_id:
                # Collapse runs of whitespace left over after tag stripping.
                series_title = re.sub(r'[\s\r\n\t]+', ' ', clean_html(mobj.group(0)))
                title = re.sub(r'[\s\r\n\t]+', ' ', clean_html(html))
                return series_title, title
        return None, None

    def _real_extract(self, url):
        """Build the info dict for one episode.

        Raises ExtractorError when ``player_data`` carries no ``url``.
        """
        video_id = self._match_id(url)
        # _VALID_URL only guarantees a single dash, so an id such as "1575-1"
        # splits into two parts.  Pad with None so the missing episode part
        # yields None instead of an IndexError; extra parts are ignored.
        series_id, season_id, episode_id = (video_id.split('-') + [None])[:3]

        webpage_url = 'https://w.duboku.io/vodplay/%s.html' % video_id
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title

        series_title, title = self._extract_titles(webpage_html, series_id)

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')
        player_encrypt = player_data.get('encrypt')
        if player_encrypt == 1:
            # percent-encoded url
            data_url = urllib.parse.unquote(data_url)
        elif player_encrypt == 2:
            # base64 of a percent-encoded url
            data_url = urllib.parse.unquote(base64.b64decode(data_url).decode('ascii'))

        # Fields shared by both kinds of result below.
        info = {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
        }

        # if it is an embedded iframe, maybe it's an external source
        headers = {'Referer': webpage_url}
        if player_data.get('from') == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            return {
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'referer': webpage_url}),
                **info,
            }

        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)

        return {
            **info,
            'formats': formats,
            'http_headers': headers,
        }


class DubokuPlaylistIE(InfoExtractor):
    # Extractor for a series overview page: /voddetail/<series>.html
    # A URL fragment, when present, selects which playlist on the page to use.
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.io entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://w.duboku.io/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }]

    def _real_extract(self, url):
        """Return a playlist result whose entries point at DubokuIE episode URLs.

        Raises ExtractorError if the URL does not match or if no non-empty
        playlist can be selected.
        """
        url_match = self._match_valid_url(url)
        if url_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = url_match.group('id')
        fragment = compat_urlparse.urlparse(url).fragment

        webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # Title: <h1 class="title">, then the "keywords" meta tag, then <title>.
        heading = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(heading.group('content')) if heading else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title_tag = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title_tag.group('content')) if title_tag else None

        # Map each element whose id looks like "playlist<N>" to the links inside it.
        playlists = {
            section.group('value'): [
                {
                    'href': unescapeHTML(link.group('value')),
                    'title': unescapeHTML(link.group('content')),
                }
                for link in _get_elements_by_tag_and_attrib(
                    section.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False)
            ]
            for section in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False)
        }

        # The fragment names the wanted playlist; without one, take the first found.
        if fragment:
            playlist_id, playlist = fragment, playlists.get(fragment)
        elif playlists:
            playlist_id, playlist = next(iter(playlists.items()))
        else:
            playlist_id = playlist = None

        if not playlist:
            if fragment:
                message = 'Cannot find %s' % fragment
            else:
                message = 'Cannot extract playlist'
            raise ExtractorError(message)

        entries = [
            self.url_result(
                compat_urlparse.urljoin('https://w.duboku.io', item['href']),
                ie=DubokuIE.ie_key(), video_title=item.get('title'))
            for item in playlist
        ]
        return self.playlist_result(entries, series_id + '#' + playlist_id, title)
Commit	Line	Data
d3d4187d	1	import base64
503406d4	2	import re
d3d4187d	3	import urllib.parse
503406d4	4
503406d4	5	from .common import InfoExtractor
de4144a4	6	from ..compat import compat_urlparse
7cc9d5b3	7	from ..utils import (
e897bd82	8	ExtractorError,
7cc9d5b3	9	clean_html,
7cc9d5b3	10	extract_attributes,
7cc9d5b3	11	get_elements_by_class,
	12	int_or_none,
	13	js_to_json,
	14	smuggle_url,
	15	unescapeHTML,
	16	)
503406d4	17
503406d4	18
de4144a4	19	def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
	20	"""Return the content of the tag with the specified attribute in the passed HTML document"""
	21
	22	if tag is None:
	23	tag = '[a-zA-Z0-9:._-]+'
	24	if attribute is None:
	25	attribute = ''
	26	else:
	27	attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
	28	if value is None:
	29	value = ''
	30	else:
	31	value = re.escape(value) if escape_value else value
	32	value = '=[\'"]?(?P<value>%s)[\'"]?' % value
	33
	34	retlist = []
	35	for m in re.finditer(r'''(?xs)
	36	<(?P<tag>%s)
	37	(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]\|="[^"]"\|='[^']'\|))?
	38	%s%s
	39	(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]\|="[^"]"\|='[^']'\|))?
	40	\s*>
	41	(?P<content>.*?)
	42	</\1>
	43	''' % (tag, attribute, value), html):
	44	retlist.append(m)
	45
	46	return retlist
	47
	48
	49	def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
	50	retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
	51	return retval[0] if retval else None
	52
	53
503406d4	54	class DubokuIE(InfoExtractor):
de4144a4	55	IE_NAME = 'duboku'
5bbe631e	56	IE_DESC = 'www.duboku.io'
de4144a4	57
5bbe631e	58	_VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
503406d4	59	_TESTS = [{
5bbe631e	60	'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
503406d4	61	'info_dict': {
503406d4	62	'id': '1575-1-1',
5bbe631e	63	'ext': 'mp4',
7cc9d5b3	64	'series': '白色月光',
	65	'title': 'contains:白色月光',
	66	'season_number': 1,
	67	'episode_number': 1,
5bbe631e	68	'season': 'Season 1',
	69	'episode_id': '1',
	70	'season_id': '1',
	71	'episode': 'Episode 1',
7cc9d5b3	72	},
	73	'params': {
	74	'skip_download': 'm3u8 download',
	75	},
	76	}, {
5bbe631e	77	'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
7cc9d5b3	78	'info_dict': {
7cc9d5b3	79	'id': '1588-1-1',
5bbe631e	80	'ext': 'mp4',
7cc9d5b3	81	'series': '亲爱的自己',
5bbe631e	82	'title': 'contains:第1集',
7cc9d5b3	83	'season_number': 1,
7cc9d5b3	84	'episode_number': 1,
5bbe631e	85	'episode': 'Episode 1',
	86	'season': 'Season 1',
	87	'episode_id': '1',
	88	'season_id': '1',
503406d4	89	},
	90	'params': {
	91	'skip_download': 'm3u8 download',
	92	},
	93	}]
	94
	95	_PLAYER_DATA_PATTERN = r'player_data\s=\s(\{\s(.)})\s;?\s</script'
	96
	97	def _real_extract(self, url):
	98	video_id = self._match_id(url)
	99	temp = video_id.split('-')
	100	series_id = temp[0]
	101	season_id = temp[1]
	102	episode_id = temp[2]
	103
5bbe631e	104	webpage_url = 'https://w.duboku.io/vodplay/%s.html' % video_id
503406d4	105	webpage_html = self._download_webpage(webpage_url, video_id)
	106
	107	# extract video url
	108
	109	player_data = self._search_regex(
	110	self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
bf739292	111	player_data = self._parse_json(player_data, video_id, js_to_json)
503406d4	112
	113	# extract title
	114
	115	temp = get_elements_by_class('title', webpage_html)
	116	series_title = None
	117	title = None
	118	for html in temp:
	119	mobj = re.search(r'<a\s+.>(.)</a>', html)
	120	if mobj:
	121	href = extract_attributes(mobj.group(0)).get('href')
	122	if href:
	123	mobj1 = re.search(r'/(\d+)\.html', href)
	124	if mobj1 and mobj1.group(1) == series_id:
	125	series_title = clean_html(mobj.group(0))
	126	series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
	127	title = clean_html(html)
	128	title = re.sub(r'[\s\r\n\t]+', ' ', title)
	129	break
	130
bf739292	131	data_url = player_data.get('url')
	132	if not data_url:
	133	raise ExtractorError('Cannot find url in player_data')
d3d4187d D	134	player_encrypt = player_data.get('encrypt')
	135	if player_encrypt == 1:
	136	data_url = urllib.parse.unquote(data_url)
	137	elif player_encrypt == 2:
	138	data_url = urllib.parse.unquote(base64.b64decode(data_url).decode('ascii'))
503406d4	139
503406d4	140	# if it is an embedded iframe, maybe it's an external source
5bbe631e	141	headers = {'Referer': webpage_url}
d3d4187d	142	if player_data.get('from') == 'iframe':
503406d4	143	# use _type url_transparent to retain the meaningful details
	144	# of the video.
	145	return {
	146	'_type': 'url_transparent',
f04b5bed	147	'url': smuggle_url(data_url, {'referer': webpage_url}),
503406d4	148	'id': video_id,
	149	'title': title,
	150	'series': series_title,
	151	'season_number': int_or_none(season_id),
	152	'season_id': season_id,
	153	'episode_number': int_or_none(episode_id),
	154	'episode_id': episode_id,
	155	}
	156
5bbe631e	157	formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)
503406d4	158
	159	return {
	160	'id': video_id,
	161	'title': title,
	162	'series': series_title,
	163	'season_number': int_or_none(season_id),
	164	'season_id': season_id,
	165	'episode_number': int_or_none(episode_id),
	166	'episode_id': episode_id,
	167	'formats': formats,
5bbe631e	168	'http_headers': headers
503406d4	169	}
de4144a4	170
	171
	172	class DubokuPlaylistIE(InfoExtractor):
	173	IE_NAME = 'duboku:list'
5bbe631e	174	IE_DESC = 'www.duboku.io entire series'
de4144a4	175
5bbe631e	176	_VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
d82b6697	177	_TESTS = [{
5bbe631e	178	'url': 'https://w.duboku.io/voddetail/1575.html',
d82b6697	179	'info_dict': {
7cc9d5b3	180	'id': 'startswith:1575',
d82b6697	181	'title': '白色月光',
	182	},
	183	'playlist_count': 12,
	184	}, {
5bbe631e	185	'url': 'https://w.duboku.io/voddetail/1554.html',
d82b6697	186	'info_dict': {
7cc9d5b3	187	'id': 'startswith:1554',
d82b6697	188	'title': '以家人之名',
d82b6697	189	},
7cc9d5b3	190	'playlist_mincount': 30,
d82b6697	191	}]
de4144a4	192
de4144a4	193	def _real_extract(self, url):
5ad28e7f	194	mobj = self._match_valid_url(url)
de4144a4	195	if mobj is None:
	196	raise ExtractorError('Invalid URL: %s' % url)
	197	series_id = mobj.group('id')
	198	fragment = compat_urlparse.urlparse(url).fragment
	199
5bbe631e	200	webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id
de4144a4	201	webpage_html = self._download_webpage(webpage_url, series_id)
	202
	203	# extract title
	204
	205	title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
	206	title = unescapeHTML(title.group('content')) if title else None
	207	if not title:
	208	title = self._html_search_meta('keywords', webpage_html)
	209	if not title:
	210	title = _get_element_by_tag_and_attrib(webpage_html, 'title')
	211	title = unescapeHTML(title.group('content')) if title else None
	212
	213	# extract playlists
	214
	215	playlists = {}
	216	for div in _get_elements_by_tag_and_attrib(
	217	webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
	218	playlist_id = div.group('value')
	219	playlist = []
	220	for a in _get_elements_by_tag_and_attrib(
	221	div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
	222	playlist.append({
	223	'href': unescapeHTML(a.group('value')),
	224	'title': unescapeHTML(a.group('content'))
	225	})
	226	playlists[playlist_id] = playlist
	227
	228	# select the specified playlist if url fragment exists
d82b6697	229	playlist = None
	230	playlist_id = None
	231	if fragment:
	232	playlist = playlists.get(fragment)
	233	playlist_id = fragment
	234	else:
bf739292	235	first = next(iter(playlists.items()), None)
d82b6697	236	if first:
d82b6697	237	(playlist_id, playlist) = first
de4144a4	238	if not playlist:
	239	raise ExtractorError(
	240	'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')
	241
	242	# return url results
	243	return self.playlist_result([
	244	self.url_result(
5bbe631e	245	compat_urlparse.urljoin('https://w.duboku.io', x['href']),
bf739292	246	ie=DubokuIE.ie_key(), video_title=x.get('title'))
d82b6697	247	for x in playlist], series_id + '#' + playlist_id, title)