[yt-dlp.git] / youtube_dl / extractor / vuclip.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    compat_urllib_parse_urlparse,
    parse_duration,
    qualities,
)


class VuClipIE(InfoExtractor):
    """Extractor for vuclip.com video pages (``/w?...cid=<digits>`` URLs).

    The numeric ``cid`` query parameter is used as the video id.  Title,
    duration and the list of downloadable formats are all scraped from the
    page HTML with regular expressions.
    """

    # Accept both http and https page URLs, with or without the ``m.`` host
    # prefix.  The rest of the extractor is scheme-agnostic: any follow-up
    # request reuses the scheme parsed from the incoming URL.
    _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'

    _TEST = {
        'url': 'http://m.vuclip.com/w?cid=843902317&fid=63532&z=1007&nvar&frm=index.html&bu=4757321434',
        'md5': '92ac9d1ccefec4f0bb474661ab144fcf',
        'info_dict': {
            'id': '843902317',
            'ext': '3gp',
            'title': 'Movie Trailer: Noah',
            'duration': 139,
        }
    }

    def _real_extract(self, url):
        """Scrape the video page at *url* and return its info dict.

        The returned dict has the keys ``id``, ``formats``, ``title`` and
        ``duration``.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)

        # The first response may contain a button whose value starts with
        # "No" and whose onClick handler redirects to another location
        # (presumably an ad prompt, judging by the names used here).  When
        # it is present, the captured target is appended to the scheme and
        # host of the original URL and that page is used instead.
        ad_m = re.search(
            r'''value="No.*?" onClick="location.href='([^"']+)'"''', webpage)
        if ad_m:
            urlr = compat_urllib_parse_urlparse(url)
            adfree_url = urlr.scheme + '://' + urlr.netloc + ad_m.group(1)
            webpage = self._download_webpage(
                adfree_url, video_id, note='Download post-ad page')

        # The download links sit between the "social align_c" div and the
        # next <hr>; restrict the link search to that fragment.
        links_code = self._search_regex(
            r'(?s)<div class="social align_c".*?>(.*?)<hr\s*/?>', webpage,
            'links')
        title = self._html_search_regex(
            r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip()

        # NOTE(review): presumably ranks 'Hi' above 'Reg' -- confirm against
        # the qualities() helper.
        quality_order = qualities(['Reg', 'Hi'])
        formats = []
        # Distinct loop names keep the ``url`` parameter from being rebound.
        for format_url, label in re.findall(
                r'<a href="(?P<url>[^"]+)".*?>(?P<q>[^<]+)</a>', links_code):
            # The format id combines the link's URL scheme with its text.
            format_id = (
                compat_urllib_parse_urlparse(format_url).scheme + '-' + label)
            formats.append({
                'format_id': format_id,
                'url': format_url,
                'quality': quality_order(label),
            })
        self._sort_formats(formats)

        # The duration is optional: the search is made with fatal=False.
        duration = parse_duration(self._search_regex(
            r'\(([0-9:]+)\)</span></h1>', webpage, 'duration', fatal=False))

        return {
            'id': video_id,
            'formats': formats,
            'title': title,
            'duration': duration,
        }
Commit	Line	Data
aec74dd9 PH	1	from __future__ import unicode_literals
	2
	3	import re
	4
	5	from .common import InfoExtractor
	6	from ..utils import (
	7	compat_urllib_parse_urlparse,
	8	parse_duration,
	9	qualities,
	10	)
	11
	12
	13	class VuClipIE(InfoExtractor):
f44e5d8b	14	_VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'
aec74dd9 PH	15
	16	_TEST = {
	17	'url': 'http://m.vuclip.com/w?cid=843902317&fid=63532&z=1007&nvar&frm=index.html&bu=4757321434',
	18	'md5': '92ac9d1ccefec4f0bb474661ab144fcf',
	19	'info_dict': {
	20	'id': '843902317',
	21	'ext': '3gp',
	22	'title': 'Movie Trailer: Noah',
	23	'duration': 139,
	24	}
	25	}
	26
	27	def _real_extract(self, url):
	28	mobj = re.match(self._VALID_URL, url)
	29	video_id = mobj.group('id')
	30
	31	webpage = self._download_webpage(url, video_id)
	32	ad_m = re.search(
	33	r'''value="No.*?" onClick="location.href='([^"']+)'"''', webpage)
	34	if ad_m:
	35	urlr = compat_urllib_parse_urlparse(url)
	36	adfree_url = urlr.scheme + '://' + urlr.netloc + ad_m.group(1)
	37	webpage = self._download_webpage(
	38	adfree_url, video_id, note='Download post-ad page')
	39
	40	links_code = self._search_regex(
	41	r'(?s)<div class="social align_c".*?>(.*?)<hr\s*/?>', webpage,
	42	'links')
	43	title = self._html_search_regex(
	44	r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip()
	45
	46	quality_order = qualities(['Reg', 'Hi'])
	47	formats = []
	48	for url, q in re.findall(
	49	r'<a href="(?P<url>[^"]+)".*?>(?P<q>[^<]+)</a>', links_code):
	50	format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q
	51	formats.append({
	52	'format_id': format_id,
	53	'url': url,
	54	'quality': quality_order(q),
	55	})
	56	self._sort_formats(formats)
	57
	58	duration = parse_duration(self._search_regex(
	59	r'\(([0-9:]+)\)</span></h1>', webpage, 'duration', fatal=False))
	60
	61	return {
	62	'id': video_id,
	63	'formats': formats,
	64	'title': title,
	65	'duration': duration,
	66	}