]>
Commit | Line | Data |
---|---|---|
1 | import re | |
2 | ||
3 | from .common import InfoExtractor | |
4 | from ..compat import compat_urlparse | |
5 | from ..utils import ( | |
6 | clean_html, | |
7 | extract_attributes, | |
8 | ExtractorError, | |
9 | get_elements_by_class, | |
10 | int_or_none, | |
11 | js_to_json, | |
12 | smuggle_url, | |
13 | unescapeHTML, | |
14 | ) | |
15 | ||
16 | ||
17 | def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True): | |
18 | """Return the content of the tag with the specified attribute in the passed HTML document""" | |
19 | ||
20 | if tag is None: | |
21 | tag = '[a-zA-Z0-9:._-]+' | |
22 | if attribute is None: | |
23 | attribute = '' | |
24 | else: | |
25 | attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute) | |
26 | if value is None: | |
27 | value = '' | |
28 | else: | |
29 | value = re.escape(value) if escape_value else value | |
30 | value = '=[\'"]?(?P<value>%s)[\'"]?' % value | |
31 | ||
32 | retlist = [] | |
33 | for m in re.finditer(r'''(?xs) | |
34 | <(?P<tag>%s) | |
35 | (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? | |
36 | %s%s | |
37 | (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? | |
38 | \s*> | |
39 | (?P<content>.*?) | |
40 | </\1> | |
41 | ''' % (tag, attribute, value), html): | |
42 | retlist.append(m) | |
43 | ||
44 | return retlist | |
45 | ||
46 | ||
def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Like _get_elements_by_tag_and_attrib, but yield only the first match (or None)."""
    matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    return next(iter(matches), None)
50 | ||
51 | ||
class DubokuIE(InfoExtractor):
    """Extractor for single-episode playback pages on www.duboku.io."""

    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.io'

    # video id is "<series>-<season>-<episode>", e.g. 1575-1-1
    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'mp4',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
            'episode': 'Episode 1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'mp4',
            'series': '亲爱的自己',
            'title': 'contains:第1集',
            'season_number': 1,
            'episode_number': 1,
            'episode': 'Episode 1',
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

    # captures the JS object literal assigned to player_data, up to the
    # closing </script>; parsed below via js_to_json
    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'

    def _real_extract(self, url):
        """Extract a single episode: parse player_data for the stream URL and
        scrape the page's "title" elements for episode/series titles."""
        video_id = self._match_id(url)
        # split "<series>-<season>-<episode>" into its three components
        temp = video_id.split('-')
        series_id = temp[0]
        season_id = temp[1]
        episode_id = temp[2]

        webpage_url = 'https://w.duboku.io/vodplay/%s.html' % video_id
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        # player_data is a JS object literal, not strict JSON
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title

        temp = get_elements_by_class('title', webpage_html)
        series_title = None
        title = None
        for html in temp:
            # look for a "title" element whose link points back at this series
            # (href like /<series_id>.html); that element also carries the
            # episode title text
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if mobj:
                href = extract_attributes(mobj.group(0)).get('href')
                if href:
                    mobj1 = re.search(r'/(\d+)\.html', href)
                    if mobj1 and mobj1.group(1) == series_id:
                        series_title = clean_html(mobj.group(0))
                        # collapse runs of whitespace left over by clean_html
                        series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
                        title = clean_html(html)
                        title = re.sub(r'[\s\r\n\t]+', ' ', title)
                        break

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')
        data_from = player_data.get('from')

        # if it is an embedded iframe, maybe it's an external source
        headers = {'Referer': webpage_url}
        if data_from == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            return {
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'http_headers': headers}),
                'id': video_id,
                'title': title,
                'series': series_title,
                'season_number': int_or_none(season_id),
                'season_id': season_id,
                'episode_number': int_or_none(episode_id),
                'episode_id': episode_id,
            }

        # otherwise data_url is (presumably) a direct HLS manifest — TODO confirm
        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)

        return {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
            'formats': formats,
            # Referer header is also needed when downloading the fragments
            'http_headers': headers
        }
164 | ||
165 | ||
class DubokuPlaylistIE(InfoExtractor):
    """Extractor for www.duboku.io series detail pages (one playlist per source)."""

    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.io entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://w.duboku.io/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }]

    def _real_extract(self, url):
        """Extract a series page: collect every <div id="playlistN"> of episode
        links; the URL fragment, when present, selects which playlist to return."""
        mobj = self._match_valid_url(url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = mobj.group('id')
        # fragment (e.g. #playlist2) names the playlist the user wants
        fragment = compat_urlparse.urlparse(url).fragment

        webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # extract title

        # prefer <h1 class="title">, then the keywords meta tag, then <title>
        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title.group('content')) if title else None

        # extract playlists

        # maps playlist id (e.g. "playlist1") -> list of {href, title} dicts
        playlists = {}
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            playlist = []
            for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
                playlist.append({
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content'))
                })
            playlists[playlist_id] = playlist

        # select the specified playlist if url fragment exists
        playlist = None
        playlist_id = None
        if fragment:
            playlist = playlists.get(fragment)
            playlist_id = fragment
        else:
            # no fragment: fall back to the first playlist on the page
            first = next(iter(playlists.items()), None)
            if first:
                (playlist_id, playlist) = first
        if not playlist:
            # % binds tighter than the conditional, so this reads as
            # ('Cannot find %s' % fragment) if fragment else 'Cannot extract playlist'
            raise ExtractorError(
                'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')

        # return url results
        return self.playlist_result([
            self.url_result(
                compat_urlparse.urljoin('https://w.duboku.io', x['href']),
                ie=DubokuIE.ie_key(), video_title=x.get('title'))
            for x in playlist], series_id + '#' + playlist_id, title)