X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/86e5f3ed2e6e71eb81ea4c9e26288f16119ffd0c..61edf57f8f13f6dfd81154174e647eb5fdd26089:/yt_dlp/extractor/tube8.py diff --git a/yt_dlp/extractor/tube8.py b/yt_dlp/extractor/tube8.py index 32e80d9d2..7267bf2bd 100644 --- a/yt_dlp/extractor/tube8.py +++ b/yt_dlp/extractor/tube8.py @@ -1,14 +1,22 @@ import re +import urllib.parse +from .common import InfoExtractor +from ..aes import aes_decrypt_text from ..utils import ( + determine_ext, + format_field, int_or_none, str_to_int, + strip_or_none, + url_or_none, ) -from .keezmovies import KeezMoviesIE -class Tube8IE(KeezMoviesIE): +class Tube8IE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P[^/]+)/(?P\d+)' + _EMBED_REGEX = [r']+\bsrc=["\'](?P(?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)'] _TESTS = [{ 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', 'md5': '65e20c48e6abff62ed0c3965fff13a39', @@ -29,11 +37,89 @@ class Tube8IE(KeezMoviesIE): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r']+\bsrc=["\']((?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)', - webpage) + def _extract_info(self, url, fatal=True): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = (mobj.group('display_id') + if 'display_id' in mobj.groupdict() + else None) or mobj.group('id') + + webpage = self._download_webpage( + url, display_id, headers={'Cookie': 'age_verified=1'}) + + formats = [] + format_urls = set() + + title = None + thumbnail = None + duration = None + encrypted = False + + def extract_format(format_url, height=None): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//')): + return + if format_url in format_urls: + return + format_urls.add(format_url) + tbr = int_or_none(self._search_regex( + r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None)) + if not height: + height = int_or_none(self._search_regex( + r'[/_](\d+)[pP][/_]', format_url, 'height', default=None)) + if encrypted: + format_url = aes_decrypt_text( + video_url, title, 32).decode('utf-8') + formats.append({ + 'url': format_url, + 'format_id': format_field(height, None, '%dp'), + 'height': height, + 'tbr': tbr, + }) + + flashvars = self._parse_json( + self._search_regex( + r'flashvars\s*=\s*({.+?});', webpage, + 'flashvars', default='{}'), + display_id, fatal=False) + + if flashvars: + title = flashvars.get('video_title') + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + encrypted = flashvars.get('encrypted') is True + for key, value in flashvars.items(): + mobj = re.search(r'quality_(\d+)[pP]', key) + if mobj: + extract_format(value, int(mobj.group(1))) + video_url = flashvars.get('video_url') + if video_url and determine_ext(video_url, None): + extract_format(video_url) + + video_url = self._html_search_regex( + r'flashvars\.video_url\s*=\s*(["\'])(?Phttp.+?)\1', + webpage, 'video url', default=None, group='url') + if video_url: + extract_format(urllib.parse.unquote(video_url)) + + if not formats: + if 'title="This video is no longer available"' in webpage: + self.raise_no_formats( + f'Video {video_id} is no longer available', expected=True) + + if not title: + title = self._html_search_regex( + r']*>([^<]+)', webpage, 'title') + + return webpage, { + 'id': video_id, + 'display_id': display_id, + 'title': strip_or_none(title), + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': 18, + 'formats': formats, + } def _real_extract(self, url): webpage, info = self._extract_info(url) @@ -67,8 +153,8 @@ def _real_extract(self, url): tags_str = self._search_regex( r'(?s)Tags:\s*\s*
(.+?)]+href=[^>]+>([^<]+)', tags_str)] if tags_str else None + tags = list(re.findall( + r']+href=[^>]+>([^<]+)', tags_str)) if tags_str else None info.update({ 'description': description,