[yt-dlp.git] / yt_dlp / extractor / xfileshare.py

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    decode_packed_codes,
    determine_ext,
    int_or_none,
    js_to_json,
    urlencode_postdata,
)


# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58
def aa_decode(aa_code):
    """Decode emoticon-obfuscated ("aaencode"-style) JavaScript text.

    The input is cut on the character separator. Inside every piece the
    emoticon expressions are replaced by the digit they stand for and the
    '+ ' joiners are removed. A piece that then starts with digits is read
    as an octal code point; one that starts with 'u' followed by lowercase
    hex digits is read as a hexadecimal code point. Any other piece
    contributes nothing.

    Returns the decoded characters concatenated in input order.
    """
    separator = '(ﾟДﾟ)[ﾟεﾟ]+'
    # Order matters: the compound expressions embed the simple ones, so they
    # have to be substituted first.
    digit_exprs = (
        ('((ﾟｰﾟ) + (o^_^o))', '7'),
        ('((o^_^o) +(o^_^o))', '6'),
        ('((ﾟｰﾟ) + (ﾟΘﾟ))', '5'),
        ('((o^_^o) - (ﾟΘﾟ))', '2'),
        ('(ﾟｰﾟ)', '4'),
        ('(o^_^o)', '3'),
        ('(ﾟΘﾟ)', '1'),
        ('(c^_^o)', '0'),
    )

    decoded = []
    for chunk in aa_code.split(separator):
        for expr, digit in digit_exprs:
            chunk = chunk.replace(expr, digit)
        chunk = chunk.replace('+ ', '')

        octal = re.match(r'^\d+', chunk)
        if octal:
            decoded.append(chr(int(octal.group(0), 8)))
            continue

        hexa = re.match(r'^u([\da-f]+)', chunk)
        if hexa:
            decoded.append(chr(int(hexa.group(1), 16)))
    return ''.join(decoded)


class XFileShareIE(InfoExtractor):
    """Extractor for the video hosting sites listed in ``_SITES``."""

    # (host regex, display name) pairs, one per supported site.
    _SITES = (
        (r'aparat\.cam', 'Aparat'),
        (r'clipwatching\.com', 'ClipWatching'),
        (r'gounlimited\.to', 'GoUnlimited'),
        (r'govid\.me', 'GoVid'),
        (r'holavid\.com', 'HolaVid'),
        (r'streamty\.com', 'Streamty'),
        (r'thevideobee\.to', 'TheVideoBee'),
        (r'uqload\.com', 'Uqload'),
        (r'vidbom\.com', 'VidBom'),
        (r'vidlo\.us', 'vidlo'),
        (r'vidlocker\.xyz', 'VidLocker'),
        (r'vidshare\.tv', 'VidShare'),
        (r'vup\.to', 'VUp'),
        (r'wolfstream\.tv', 'WolfStream'),
        (r'xvideosharing\.com', 'XVideoSharing'),
    )

    # NOTE: in a class body only the outermost iterable of a generator
    # expression is evaluated in class scope, which is why _SITES is iterated
    # directly here and never referenced from inside the expression.
    IE_DESC = 'XFileShare based sites: %s' % ', '.join(name for _, name in _SITES)
    _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
                  % '|'.join(host_re for host_re, _ in _SITES))
    _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' % '|'.join(host_re for host_re, _ in _SITES)]

    # Page markers that mean the requested file is gone.
    _FILE_NOT_FOUND_REGEXES = (
        r'>(?:404 - )?File Not Found<',
        r'>The file was removed by administrator<',
    )

    _TESTS = [{
        'url': 'http://xvideosharing.com/fq65f94nd2ve',
        'md5': '4181f63957e8fe90ac836fa58dc3c8a6',
        'info_dict': {
            'id': 'fq65f94nd2ve',
            'ext': 'mp4',
            'title': 'sample',
            'thumbnail': r're:http://.*\.jpg',
        },
    }, {
        'url': 'https://aparat.cam/n4d6dh0wvlpr',
        'only_matching': True,
    }, {
        'url': 'https://wolfstream.tv/nthme29v9u2x',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract id, title, thumbnail and formats for a single video URL.

        Steps: download the video page, submit the intermediate "download1"
        form when the page carries one, decode obfuscated scripts in place,
        then read the formats from the jwplayer configuration or, failing
        that, from direct media URLs found in the page.

        Raises ExtractorError (expected) when the page matches one of
        ``_FILE_NOT_FOUND_REGEXES``.
        """
        # Pick the groups by name so the unpacking does not silently break if
        # a host pattern ever gains a capturing group of its own.
        host, video_id = self._match_valid_url(url).group('host', 'id')

        # These two hosts are fetched through their embed page.
        url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id)
        webpage = self._download_webpage(url, video_id)

        if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES):
            raise ExtractorError('Video %s does not exist' % video_id, expected=True)

        fields = self._hidden_inputs(webpage)

        if fields.get('op') == 'download1':
            # Intermediate form: honour the advertised countdown (if any) and
            # re-post the hidden inputs to reach the actual video page.
            countdown = int_or_none(self._search_regex(
                r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>',
                webpage, 'countdown', default=None))
            if countdown:
                self._sleep(countdown, video_id)

            webpage = self._download_webpage(
                url, video_id, 'Downloading video page',
                data=urlencode_postdata(fields), headers={
                    'Referer': url,
                    'Content-type': 'application/x-www-form-urlencoded',
                })

        # Title fallbacks: page-specific patterns, then the og:title value,
        # then the video id.
        title = (self._search_regex(
            (r'style="z-index: [0-9]+;">([^<]+)</span>',
             r'<td nowrap>([^<]+)</td>',
             r'h4-fine[^>]*>([^<]+)<',
             r'>Watch (.+)[ <]',
             r'<h2 class="video-page-head">([^<]+)</h2>',
             r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<',  # streamin.to
             r'title\s*:\s*"([^"]+)"'),  # govid.me
            webpage, 'title', default=None) or self._og_search_title(
            webpage, default=None) or video_id).strip()

        # Replace obfuscated script code with its decoded form so that the
        # searches below can see the player configuration.
        for regex, func in (
                (r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes),
                (r'(ﾟ.+)', aa_decode)):
            obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None)
            if obf_code:
                webpage = webpage.replace(obf_code, func(obf_code))

        formats = []

        jwplayer_data = self._search_regex(
            [
                r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);',
                r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);',
            ], webpage,
            'jwplayer data', default=None)
        if jwplayer_data:
            jwplayer_data = self._parse_json(
                jwplayer_data.replace(r"\'", "'"), video_id, js_to_json)
            if jwplayer_data:
                formats = self._parse_jwplayer_data(
                    jwplayer_data, video_id, False,
                    m3u8_id='hls', mpd_id='dash')['formats']

        if not formats:
            # No usable jwplayer configuration: collect direct media URLs.
            urls = []

            def add_url(candidate):
                # Keep first-seen order, drop repeats and anything that is
                # not a URL string.
                if isinstance(candidate, str) and candidate not in urls:
                    urls.append(candidate)

            for regex in (
                    r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
                    r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1',
                    r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',
                    r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'):
                for mobj in re.finditer(regex, webpage):
                    add_url(mobj.group('url'))

            sources = self._search_regex(
                r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None)
            if sources:
                # The array usually repeats URLs already matched above, so
                # route it through the same de-duplication.
                for video_url in self._parse_json(sources, video_id):
                    add_url(video_url)

            for video_url in urls:
                if determine_ext(video_url) == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        video_url, video_id, 'mp4',
                        entry_protocol='m3u8_native', m3u8_id='hls',
                        fatal=False))
                else:
                    formats.append({
                        'url': video_url,
                        'format_id': 'sd',
                    })
        self._sort_formats(formats)

        thumbnail = self._search_regex(
            [
                r'<video[^>]+poster="([^"]+)"',
                r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],',
            ], webpage, 'thumbnail', default=None)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
        }
Commit	Line	Data
617c0b22	1	import re
	2
	3	from .common import InfoExtractor
1cc79574	4	from ..utils import (
ac668111	5	ExtractorError,
bccdac68	6	decode_packed_codes,
2cd668ee	7	determine_ext,
ceb33673	8	int_or_none,
59296bae	9	js_to_json,
6e6bc8da	10	urlencode_postdata,
5f28a1ac PP	11	)
5f28a1ac PP	12
617c0b22	13
59296bae RA	14	# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58
	15	def aa_decode(aa_code):
	16	symbol_table = [
	17	('7', '((ﾟｰﾟ) + (o^_^o))'),
	18	('6', '((o^_^o) +(o^_^o))'),
	19	('5', '((ﾟｰﾟ) + (ﾟΘﾟ))'),
	20	('2', '((o^_^o) - (ﾟΘﾟ))'),
	21	('4', '(ﾟｰﾟ)'),
	22	('3', '(o^_^o)'),
	23	('1', '(ﾟΘﾟ)'),
	24	('0', '(c^_^o)'),
	25	]
	26	delim = '(ﾟДﾟ)[ﾟεﾟ]+'
	27	ret = ''
	28	for aa_char in aa_code.split(delim):
	29	for val, pat in symbol_table:
	30	aa_char = aa_char.replace(pat, val)
	31	aa_char = aa_char.replace('+ ', '')
	32	m = re.match(r'^\d+', aa_char)
	33	if m:
ac668111	34	ret += chr(int(m.group(0), 8))
59296bae RA	35	else:
	36	m = re.match(r'^u([\da-f]+)', aa_char)
	37	if m:
ac668111	38	ret += chr(int(m.group(1), 16))
59296bae RA	39	return ret
	40
	41
031ec536	42	class XFileShareIE(InfoExtractor):
41745523	43	_SITES = (
00dd0cd5	44	(r'aparat\.cam', 'Aparat'),
59296bae RA	45	(r'clipwatching\.com', 'ClipWatching'),
	46	(r'gounlimited\.to', 'GoUnlimited'),
	47	(r'govid\.me', 'GoVid'),
	48	(r'holavid\.com', 'HolaVid'),
	49	(r'streamty\.com', 'Streamty'),
2dc48df5	50	(r'thevideobee\.to', 'TheVideoBee'),
59296bae	51	(r'uqload\.com', 'Uqload'),
2dc48df5 S	52	(r'vidbom\.com', 'VidBom'),
2dc48df5 S	53	(r'vidlo\.us', 'vidlo'),
59296bae RA	54	(r'vidlocker\.xyz', 'VidLocker'),
	55	(r'vidshare\.tv', 'VidShare'),
	56	(r'vup\.to', 'VUp'),
41d1cca3	57	(r'wolfstream\.tv', 'WolfStream'),
59296bae	58	(r'xvideosharing\.com', 'XVideoSharing'),
41745523 S	59	)
	60
	61	IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
59296bae	62	_VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
2dc48df5	63	% '\|'.join(site for site in list(zip(*_SITES))[0]))
bfd973ec	64	_EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.?)\1' % '\|'.join(site for site in list(zip(_SITES))[0])]
5f28a1ac	65
33b72ce6 S	66	_FILE_NOT_FOUND_REGEXES = (
	67	r'>(?:404 - )?File Not Found<',
	68	r'>The file was removed by administrator<',
	69	)
3ae165aa	70
5f28a1ac	71	_TESTS = [{
59296bae RA	72	'url': 'http://xvideosharing.com/fq65f94nd2ve',
59296bae RA	73	'md5': '4181f63957e8fe90ac836fa58dc3c8a6',
953b3586	74	'info_dict': {
59296bae	75	'id': 'fq65f94nd2ve',
953b3586	76	'ext': 'mp4',
59296bae	77	'title': 'sample',
ec85ded8	78	'thumbnail': r're:http://.*\.jpg',
2cd668ee	79	},
00dd0cd5	80	}, {
	81	'url': 'https://aparat.cam/n4d6dh0wvlpr',
	82	'only_matching': True,
41d1cca3	83	}, {
	84	'url': 'https://wolfstream.tv/nthme29v9u2x',
	85	'only_matching': True,
5f28a1ac	86	}]
617c0b22	87
617c0b22	88	def _real_extract(self, url):
5ad28e7f	89	host, video_id = self._match_valid_url(url).groups()
617c0b22	90
59296bae	91	url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id)
e213c98d	92	webpage = self._download_webpage(url, video_id)
617c0b22	93
33b72ce6	94	if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES):
3ae165aa S	95	raise ExtractorError('Video %s does not exist' % video_id, expected=True)
3ae165aa S	96
f8da79f8	97	fields = self._hidden_inputs(webpage)
5f6a1245	98
59296bae	99	if fields.get('op') == 'download1':
ceb33673 S	100	countdown = int_or_none(self._search_regex(
	101	r'<span id="countdown_str">(?:[Ww]ait)?\s<span id="cxc">(\d+)</span>\s(?:seconds?)?</span>',
	102	webpage, 'countdown', default=None))
	103	if countdown:
	104	self._sleep(countdown, video_id)
	105
16bc9582 S	106	webpage = self._download_webpage(
	107	url, video_id, 'Downloading video page',
	108	data=urlencode_postdata(fields), headers={
	109	'Referer': url,
	110	'Content-type': 'application/x-www-form-urlencoded',
	111	})
5f28a1ac	112
668db403	113	title = (self._search_regex(
190d2027	114	(r'style="z-index: [0-9]+;">([^<]+)</span>',
b9ad1019	115	r'<td nowrap>([^<]+)</td>',
1ad61430	116	r'h4-fine[^>]*>([^<]+)<',
59296bae	117	r'>Watch (.+)[ <]',
190d2027	118	r'<h2 class="video-page-head">([^<]+)</h2>',
59296bae RA	119	r'<h2 style="[^"]color:#403f3d[^"]"[^>]*>([^<]+)<', # streamin.to
59296bae RA	120	r'title\s:\s"([^"]+)"'), # govid.me
190d2027 S	121	webpage, 'title', default=None) or self._og_search_title(
190d2027 S	122	webpage, default=None) or video_id).strip()
bccdac68	123
59296bae RA	124	for regex, func in (
	125	(r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes),
	126	(r'(ﾟ.+)', aa_decode)):
	127	obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None)
	128	if obf_code:
	129	webpage = webpage.replace(obf_code, func(obf_code))
	130
	131	formats = []
	132
	133	jwplayer_data = self._search_regex(
	134	[
	135	r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);',
	136	r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);',
	137	], webpage,
	138	'jwplayer data', default=None)
	139	if jwplayer_data:
	140	jwplayer_data = self._parse_json(
	141	jwplayer_data.replace(r"\'", "'"), video_id, js_to_json)
	142	if jwplayer_data:
	143	formats = self._parse_jwplayer_data(
	144	jwplayer_data, video_id, False,
	145	m3u8_id='hls', mpd_id='dash')['formats']
	146
	147	if not formats:
2cd668ee S	148	urls = []
2cd668ee S	149	for regex in (
02d61a65	150	r'(?:file\|src)\s:\s(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8\|mp4\|flv)(?:(?!\1).)*)\1',
2cd668ee S	151	r'file_link\s=\s(["\'])(?P<url>http(?:(?!\1).)+)\1',
	152	r'addVariable\((\\?["\'])file\1\s,\s(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',
	153	r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8\|mp4\|flv)(?:(?!\1).)*)\1'):
	154	for mobj in re.finditer(regex, webpage):
	155	video_url = mobj.group('url')
	156	if video_url not in urls:
	157	urls.append(video_url)
59296bae RA	158
	159	sources = self._search_regex(
	160	r'sources\s:\s(\[(?!{)[^\]]+\])', webpage, 'sources', default=None)
	161	if sources:
	162	urls.extend(self._parse_json(sources, video_id))
	163
2cd668ee S	164	formats = []
	165	for video_url in urls:
	166	if determine_ext(video_url) == 'm3u8':
	167	formats.extend(self._extract_m3u8_formats(
	168	video_url, video_id, 'mp4',
	169	entry_protocol='m3u8_native', m3u8_id='hls',
	170	fatal=False))
	171	else:
	172	formats.append({
	173	'url': video_url,
	174	'format_id': 'sd',
	175	})
59296bae	176	self._sort_formats(formats)
bccdac68	177
ceb33673	178	thumbnail = self._search_regex(
59296bae RA	179	[
	180	r'<video[^>]+poster="([^"]+)"',
	181	r'(?:image\|poster)\s:\s["\'](http[^"\']+)["\'],',
	182	], webpage, 'thumbnail', default=None)
5f28a1ac	183
5f28a1ac	184	return {
617c0b22	185	'id': video_id,
617c0b22	186	'title': title,
5f28a1ac PP	187	'thumbnail': thumbnail,
5f28a1ac PP	188	'formats': formats,
617c0b22	189	}