# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    clean_html,
    extract_attributes,
    ExtractorError,
    get_elements_by_class,
    int_or_none,
    js_to_json,
    smuggle_url,
    unescapeHTML,
)


def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
21 """Return the content of the tag with the specified attribute in the passed HTML document"""

    if tag is None:
        tag = '[a-zA-Z0-9:._-]+'
    if attribute is None:
        attribute = ''
    else:
        attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
    if value is None:
        value = ''
    else:
        value = re.escape(value) if escape_value else value
        value = '=[\'"]?(?P<value>%s)[\'"]?' % value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <(?P<tag>%s)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        %s%s
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (tag, attribute, value), html):
        retlist.append(m)

    return retlist


def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    return retval[0] if retval else None

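# Illustrative sketch (not part of the extractor flow): both helpers return
# re match objects, so callers pull the named groups out themselves, e.g.
#
#   div = _get_element_by_tag_and_attrib(html, 'div', 'id', 'playlist1')
#   inner_html = div.group('content') if div else None
#
# where 'playlist1' is only a hypothetical id value.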

class DubokuIE(InfoExtractor):
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.co'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'ts',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://www.duboku.co/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'ts',
            'series': '亲爱的自己',
            'title': 'contains:预告片',
            'season_number': 1,
            'episode_number': 1,
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
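    # Presumably the play page embeds something like
    #   var player_data = {"url": "...m3u8", "from": "..."};
    # in an inline <script>; the pattern above captures that object and
    # _real_extract() below reads its 'url' and 'from' keys.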

    def _real_extract(self, url):
        video_id = self._match_id(url)
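        # video ids look like <series>-<season>-<episode>, e.g. 1575-1-1 (see _TESTS)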
        temp = video_id.split('-')
        series_id = temp[0]
        season_id = temp[1]
        episode_id = temp[2]

        webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(js_to_json(player_data), video_id)

        # extract title

        temp = get_elements_by_class('title', webpage_html)
        series_title = None
        title = None
        for html in temp:
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if mobj:
                href = extract_attributes(mobj.group(0)).get('href')
                if href:
                    mobj1 = re.search(r'/(\d+)\.html', href)
                    if mobj1 and mobj1.group(1) == series_id:
                        series_title = clean_html(mobj.group(0))
                        series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
                        title = clean_html(html)
                        title = re.sub(r'[\s\r\n\t]+', ' ', title)
                        break

        data_url = player_data['url']
        assert data_url
        data_from = player_data.get('from')

        # if it is an embedded iframe, maybe it's an external source
        if data_from == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            return {
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}),
                'id': video_id,
                'title': title,
                'series': series_title,
                'season_number': int_or_none(season_id),
                'season_id': season_id,
                'episode_number': int_or_none(episode_id),
                'episode_id': episode_id,
            }

        formats = self._extract_m3u8_formats(data_url, video_id, 'ts')

        return {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
            'formats': formats,
        }


class DubokuPlaylistIE(InfoExtractor):
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.co entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://www.duboku.co/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://www.duboku.co/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }, {
        'url': 'https://www.duboku.co/voddetail/1554.html#playlist2',
        'info_dict': {
            'id': '1554#playlist2',
            'title': '以家人之名',
        },
        'playlist_mincount': 27,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = mobj.group('id')
        fragment = compat_urlparse.urlparse(url).fragment

        webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # extract title

        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
            if not title:
                title = _get_element_by_tag_and_attrib(webpage_html, 'title')
                title = unescapeHTML(title.group('content')) if title else None

        # extract playlists
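        # The series page appears to carry one or more <div id="playlistN">
        # blocks whose <a href="..."> children point at the individual episode
        # pages; collect them keyed by the div id.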

        playlists = {}
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            playlist = []
            for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
                playlist.append({
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content'))
                })
            playlists[playlist_id] = playlist

        # select the specified playlist if url fragment exists
        playlist = None
        playlist_id = None
        if fragment:
            playlist = playlists.get(fragment)
            playlist_id = fragment
        else:
            # default to None so an empty page raises the ExtractorError below
            first = next(iter(playlists.items()), None)
            if first:
                (playlist_id, playlist) = first
        if not playlist:
            raise ExtractorError(
                'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')

        # return url results
        return self.playlist_result([
            self.url_result(
                'https://www.duboku.co' + x['href'], DubokuIE.ie_key(), video_title=x.get('title'))
            for x in playlist], series_id + '#' + playlist_id, title)