[yt-dlp.git] / yt_dlp / extractor / vshare.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_chr
from ..utils import (
    decode_packed_codes,
    ExtractorError,
)


class VShareIE(InfoExtractor):
    """Information extractor for videos hosted on vshare.io.

    Both the ``/d/<id>`` and ``/v/<id>`` URL forms are accepted. The media
    markup is not present in the page as plain HTML: it is carried by a packed
    ``eval(function...`` script that has to be unpacked and decoded first
    (see ``_extract_packed``).
    """

    # The path segment after /d/ or /v/ is captured as the video id.
    _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://vshare.io/d/0f64ce6',
        'md5': '17b39f55b5497ae8b59f5fbce8e35886',
        'info_dict': {
            'id': '0f64ce6',
            'title': 'vl14062007715967',
            'ext': 'mp4',
        }
    }, {
        'url': 'https://vshare.io/v/0f64ce6/width-650/height-430/1',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the vshare.io ``/v/`` URLs used as ``<iframe>`` sources.

        The result is a list of the matched URL strings (scheme-relative
        ``//vshare.io/...`` sources are returned as found); it is empty when
        *webpage* embeds no vshare.io player.
        """
        return re.findall(
            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)',
            webpage)

    def _extract_packed(self, webpage):
        """Decode the obfuscated markup carried by the packed script.

        The unpacked script is expected to contain a bracketed list of
        integers and a ``fromCharCode(...)`` call ending in a number. That
        number is subtracted from every integer of the list to obtain the
        character codes of the decoded text.

        Returns the decoded text as a single string. A pattern that cannot be
        found is reported by ``_search_regex``.
        """
        packed = self._search_regex(
            r'(eval\(function.+)', webpage, 'packed code')
        unpacked = decode_packed_codes(packed)

        # Digit runs separated by single commas. An optional trailing comma is
        # matched outside the capture group so that splitting never yields an
        # empty item (int('') would raise ValueError), and the pattern is
        # unambiguous, unlike the nested-quantifier form (?:\d+,?)+ which can
        # backtrack exponentially on an unterminated digit run.
        digits = self._search_regex(
            r'\[(\d+(?:,\d+)*),?\]', unpacked, 'digits')

        # Convert the offset once instead of once per character.
        key = int(self._search_regex(
            r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit'))

        return ''.join(
            compat_chr(int(digit) - key) for digit in digits.split(','))

    def _real_extract(self, url):
        """Extract the info dict (formats, id and title) for *url*.

        Raises ExtractorError with ``expected=True`` when the page shows an
        error block, and a plain ExtractorError when the decoded markup
        yields no media entry.
        """
        video_id = self._match_id(url)

        # The embed page is always the one downloaded, whichever of the two
        # URL forms was given; the original URL is sent as the referer.
        webpage = self._download_webpage(
            'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
            video_id, headers={'Referer': url})

        # Look for a site-reported error before anything that may fail hard,
        # so the site's own message is what gets surfaced to the user.
        error = self._html_search_regex(
            r'(?s)<div[^>]+\bclass=["\']xxx-error[^>]+>(.+?)</div', webpage,
            'error', default=None)
        if error:
            raise ExtractorError(error, expected=True)

        title = self._html_search_regex(
            r'<title>([^<]+)</title>', webpage, 'title')
        # Keep only what precedes the first ' - ' separator.
        title = title.split(' - ')[0]

        # The decoded text is wrapped in <video> so that the generic HTML5
        # media parser can pick up whatever it contains.
        entries = self._parse_html5_media_entries(
            url, '<video>%s</video>' % self._extract_packed(webpage),
            video_id)
        if not entries:
            # Indexing an empty list would surface as a bare IndexError.
            raise ExtractorError('Unable to find any media entry')
        info = entries[0]

        self._sort_formats(info['formats'])

        info.update({
            'id': video_id,
            'title': title,
        })

        return info
Commit	Line	Data
2ab0bfcd S	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
0987f2dd T	4	import re
0987f2dd T	5
2ab0bfcd	6	from .common import InfoExtractor
0987f2dd	7	from ..compat import compat_chr
ff31f2d5 S	8	from ..utils import (
	9	decode_packed_codes,
	10	ExtractorError,
	11	)
2ab0bfcd S	12
	13
	14	class VShareIE(InfoExtractor):
	15	_VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)'
	16	_TESTS = [{
	17	'url': 'https://vshare.io/d/0f64ce6',
0987f2dd	18	'md5': '17b39f55b5497ae8b59f5fbce8e35886',
2ab0bfcd S	19	'info_dict': {
	20	'id': '0f64ce6',
	21	'title': 'vl14062007715967',
	22	'ext': 'mp4',
	23	}
	24	}, {
	25	'url': 'https://vshare.io/v/0f64ce6/width-650/height-430/1',
	26	'only_matching': True,
	27	}]
	28
a2b6aba8 S	29	@staticmethod
	30	def _extract_urls(webpage):
	31	return re.findall(
	32	r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)',
	33	webpage)
	34
0987f2dd	35	def _extract_packed(self, webpage):
a2b6aba8 S	36	packed = self._search_regex(
a2b6aba8 S	37	r'(eval\(function.+)', webpage, 'packed code')
0987f2dd T	38	unpacked = decode_packed_codes(packed)
0987f2dd T	39	digits = self._search_regex(r'\[((?:\d+,?)+)\]', unpacked, 'digits')
a2b6aba8 S	40	digits = [int(digit) for digit in digits.split(',')]
	41	key_digit = self._search_regex(
	42	r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit')
0987f2dd T	43	chars = [compat_chr(d - int(key_digit)) for d in digits]
	44	return ''.join(chars)
	45
2ab0bfcd S	46	def _real_extract(self, url):
	47	video_id = self._match_id(url)
	48
	49	webpage = self._download_webpage(
a2b6aba8	50	'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
794c1b6e	51	video_id, headers={'Referer': url})
2ab0bfcd	52
a2b6aba8 S	53	title = self._html_search_regex(
a2b6aba8 S	54	r'<title>([^<]+)</title>', webpage, 'title')
0987f2dd	55	title = title.split(' - ')[0]
2ab0bfcd	56
ff31f2d5 S	57	error = self._html_search_regex(
	58	r'(?s)<div[^>]+\bclass=["\']xxx-error[^>]+>(.+?)</div', webpage,
	59	'error', default=None)
	60	if error:
	61	raise ExtractorError(error, expected=True)
	62
a2b6aba8 S	63	info = self._parse_html5_media_entries(
	64	url, '<video>%s</video>' % self._extract_packed(webpage),
	65	video_id)[0]
	66
	67	self._sort_formats(info['formats'])
	68
	69	info.update({
2ab0bfcd S	70	'id': video_id,
2ab0bfcd S	71	'title': title,
a2b6aba8	72	})
0987f2dd	73
a2b6aba8	74	return info