1 | import base64 | |
2 | import re | |
3 | import urllib.parse | |
4 | ||
5 | from .common import InfoExtractor | |
6 | from ..utils import ( | |
7 | ExtractorError, | |
8 | clean_html, | |
9 | extract_attributes, | |
10 | get_elements_by_class, | |
11 | int_or_none, | |
12 | js_to_json, | |
13 | smuggle_url, | |
14 | unescapeHTML, | |
15 | ) | |
16 | ||
17 | ||
18 | def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True): | |
19 | """Return the content of the tag with the specified attribute in the passed HTML document""" | |
20 | ||
21 | if tag is None: | |
22 | tag = '[a-zA-Z0-9:._-]+' | |
23 | if attribute is None: | |
24 | attribute = '' | |
25 | else: | |
26 | attribute = rf'\s+(?P<attribute>{re.escape(attribute)})' | |
27 | if value is None: | |
28 | value = '' | |
29 | else: | |
30 | value = re.escape(value) if escape_value else value | |
31 | value = f'=[\'"]?(?P<value>{value})[\'"]?' | |
32 | ||
33 | retlist = [] | |
34 | for m in re.finditer(rf'''(?xs) | |
35 | <(?P<tag>{tag}) | |
36 | (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? | |
37 | {attribute}{value} | |
38 | (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? | |
39 | \s*> | |
40 | (?P<content>.*?) | |
41 | </\1> | |
42 | ''', html): | |
43 | retlist.append(m) | |
44 | ||
45 | return retlist | |
46 | ||
47 | ||
def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Like _get_elements_by_tag_and_attrib, but return only the first match (or None)."""
    matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    return next(iter(matches), None)
51 | ||
52 | ||
class DubokuIE(InfoExtractor):
    """Extract a single episode from a www.duboku.io /vodplay/ page.

    The video id has the form ``<series>-<season>-<episode>`` (e.g. 1575-1-1).
    """
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.io'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'mp4',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
            'episode': 'Episode 1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'mp4',
            'series': '亲爱的自己',
            'title': 'contains:第1集',
            'season_number': 1,
            'episode_number': 1,
            'episode': 'Episode 1',
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

    # Matches the inline `player_data = { ... };</script>` JS assignment on the
    # watch page; group 1 captures the whole (JS-style) object literal.
    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'

    def _real_extract(self, url):
        video_id = self._match_id(url)
        # video id is '<series>-<season>-<episode>'
        temp = video_id.split('-')
        series_id = temp[0]
        season_id = temp[1]
        episode_id = temp[2]

        webpage_url = f'https://w.duboku.io/vodplay/{video_id}.html'
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        # player_data is JS, not strict JSON, hence the js_to_json transform
        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title

        # scan elements with class 'title'; the one whose <a href="/<id>.html">
        # points back at this series gives both the series and episode titles
        temp = get_elements_by_class('title', webpage_html)
        series_title = None
        title = None
        for html in temp:
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if mobj:
                href = extract_attributes(mobj.group(0)).get('href')
                if href:
                    mobj1 = re.search(r'/(\d+)\.html', href)
                    if mobj1 and mobj1.group(1) == series_id:
                        series_title = clean_html(mobj.group(0))
                        # collapse runs of whitespace left over from the markup
                        series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
                        title = clean_html(html)
                        title = re.sub(r'[\s\r\n\t]+', ' ', title)
                        break

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')
        # 'encrypt' selects how the url is obfuscated:
        #   1 = URL-quoted, 2 = base64-encoded then URL-quoted
        player_encrypt = player_data.get('encrypt')
        if player_encrypt == 1:
            data_url = urllib.parse.unquote(data_url)
        elif player_encrypt == 2:
            data_url = urllib.parse.unquote(base64.b64decode(data_url).decode('ascii'))

        # if it is an embedded iframe, maybe it's an external source
        headers = {'Referer': webpage_url}
        if player_data.get('from') == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            return {
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'referer': webpage_url}),
                'id': video_id,
                'title': title,
                'series': series_title,
                'season_number': int_or_none(season_id),
                'season_id': season_id,
                'episode_number': int_or_none(episode_id),
                'episode_id': episode_id,
            }

        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)

        return {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
            'formats': formats,
            # Referer is required for the m3u8 segments as well
            'http_headers': headers,
        }
169 | ||
170 | ||
class DubokuPlaylistIE(InfoExtractor):
    """Extract every episode of a series from a www.duboku.io /voddetail/ page.

    A URL fragment (e.g. ``...html#playlist2``) selects a specific playlist
    tab on the page; without one, the first playlist found is used.
    """
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.io entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://w.duboku.io/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }]

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        if mobj is None:
            raise ExtractorError(f'Invalid URL: {url}')
        series_id = mobj.group('id')
        # the fragment (text after '#') names the playlist tab to extract
        fragment = urllib.parse.urlparse(url).fragment

        webpage_url = f'https://w.duboku.io/voddetail/{series_id}.html'
        webpage_html = self._download_webpage(webpage_url, series_id)

        # extract title

        # preference order: <h1 class="title">, then the 'keywords' meta
        # tag, then the document <title>
        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title.group('content')) if title else None

        # extract playlists

        # each <div id="playlistN"> wraps the episode links of one tab;
        # map playlist id -> list of {'href', 'title'} dicts
        playlists = {}
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            playlist = []
            for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
                playlist.append({
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content')),
                })
            playlists[playlist_id] = playlist

        # select the specified playlist if url fragment exists
        playlist = None
        playlist_id = None
        if fragment:
            playlist = playlists.get(fragment)
            playlist_id = fragment
        else:
            # no fragment: fall back to the first playlist on the page
            first = next(iter(playlists.items()), None)
            if first:
                (playlist_id, playlist) = first
        if not playlist:
            raise ExtractorError(
                f'Cannot find {fragment}' if fragment else 'Cannot extract playlist')

        # return url results
        return self.playlist_result([
            self.url_result(
                urllib.parse.urljoin('https://w.duboku.io', x['href']),
                ie=DubokuIE.ie_key(), video_title=x.get('title'))
            for x in playlist], series_id + '#' + playlist_id, title)