[yt-dlp.git] / youtube_dl / extractor / metacafe.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    compat_parse_qs,
    compat_urllib_parse,
    compat_urllib_request,
    determine_ext,
    ExtractorError,
)


class MetacafeIE(InfoExtractor):
    """Information extractor for metacafe.com watch pages.

    Watch URLs whose id carries a ``yt-`` or ``cb-`` prefix are delegated to
    the Youtube and ThePlatform extractors respectively; every other id
    (including ``an-`` AnyClip ids) is resolved by scraping the Metacafe
    watch page itself.
    """

    _VALID_URL = r'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = 'metacafe'
    _TESTS = [
        # Youtube video
        {
            'add_ie': ['Youtube'],
            'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
            'info_dict': {
                'id': '_aUehQsCQtM',
                'ext': 'mp4',
                'upload_date': '20090102',
                'title': 'The Electric Company | "Short I" | PBS KIDS GO!',
                'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8',
                'uploader': 'PBS',
                'uploader_id': 'PBS'
            }
        },
        # Normal metacafe video
        {
            'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
            'md5': '6e0bca200eaad2552e6915ed6fd4d9ad',
            'info_dict': {
                'id': '11121940',
                'ext': 'mp4',
                'title': 'News: Stuff You Won\'t Do with Your PlayStation 4',
                'uploader': 'ign',
                'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
            },
        },
        # AnyClip video
        {
            'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/',
            'info_dict': {
                'id': 'an-dVVXnuY7Jh77J',
                'ext': 'mp4',
                'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
                'uploader': 'anyclip',
                'description': 'md5:38c711dd98f5bb87acf973d573442e67',
            },
        },
        # age-restricted video
        {
            'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
            'md5': '98dde7c1a35d02178e8ab7560fe8bd09',
            'info_dict': {
                'id': '5186653',
                'ext': 'mp4',
                'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
                'uploader': 'Dwayne Pipe',
                'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b',
                'age_limit': 18,
            },
        },
        # cbs video
        {
            'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/',
            'info_dict': {
                'id': '8VD4r_Zws8VP',
                'ext': 'flv',
                'title': 'Open: This is Face the Nation, February 9',
                'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
                'duration': 96,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        },
    ]

    def report_disclaimer(self):
        """Print a status line announcing the disclaimer download."""
        self.to_screen('Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and submit the over-18 form.

        Both requests are made only for their side effects on the session;
        the downloaded pages are discarded.
        """
        # Retrieve disclaimer
        self.report_disclaimer()
        self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        # urlencode() output is pure ASCII; encoding it yields the bytes body
        # that urllib expects on Python 3 and is a no-op on Python 2.
        post_data = compat_urllib_parse.urlencode(disclaimer_form).encode('ascii')
        request = compat_urllib_request.Request(self._FILTER_POST, post_data)
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        self.report_age_confirmation()
        self._download_webpage(request, None, False, 'Unable to confirm age')

    def _extract_video_url(self, webpage):
        """Locate the media URL in a watch page.

        Three page layouts are tried in order: a ``&mediaURL=`` query
        parameter (optionally paired with ``&gdaKey=``), an HTML5
        ``<video src=...>`` tag, and finally the ``mediaData`` entry of the
        ``flashvars`` form value.

        Returns a ``(video_url, video_ext)`` tuple.
        Raises ExtractorError if none of the layouts yields a URL.
        """
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            media_url = compat_urllib_parse.unquote(mobj.group(1))
            # Derive the extension from the bare media URL rather than its
            # last three characters, so query strings and extensions of
            # other lengths are handled.
            video_ext = determine_ext(media_url)

            # Append the gdaKey if the page provides one
            gda_mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if gda_mobj is None:
                return media_url, video_ext
            return '%s?__gda__=%s' % (media_url, gda_mobj.group(1)), video_ext

        mobj = re.search(r'<video src="([^"]+)"', webpage)
        if mobj is not None:
            return mobj.group(1), 'mp4'

        flashvars = self._search_regex(
            r' name="flashvars" value="(.*?)"', webpage, 'flashvars')
        vardict = compat_parse_qs(flashvars)
        if 'mediaData' not in vardict:
            raise ExtractorError('Unable to extract media URL')
        mobj = re.search(
            r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
        if mobj is None:
            raise ExtractorError('Unable to extract media URL')
        # The URL is embedded with escaped slashes ("\/")
        media_url = mobj.group('mediaURL').replace('\\/', '/')
        video_url = '%s?__gda__=%s' % (media_url, mobj.group('key'))
        return video_url, determine_ext(video_url)

    def _real_extract(self, url):
        """Return the info dict (or a delegating url_result) for *url*.

        Raises ExtractorError if *url* does not match ``_VALID_URL`` or no
        media URL can be found in the watch page.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # the video may come from an external site
        m_external = re.match(r'^(\w{2})-(.*)$', video_id)
        if m_external is not None:
            prefix, ext_id = m_external.groups()
            # Check if video comes from YouTube
            if prefix == 'yt':
                return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
            # CBS videos use theplatform.com
            if prefix == 'cb':
                return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')

        # Retrieve video webpage to extract further information
        req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)

        # AnyClip videos require the flashversion cookie so that we get the link
        # to the mp4 file
        if video_id.startswith('an-'):
            req.add_header('Cookie', 'flashVersion=0;')
        webpage = self._download_webpage(req, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        video_url, video_ext = self._extract_video_url(webpage)

        video_title = self._html_search_regex(
            r'(?im)<title>(.*) - Video</title>', webpage, 'title')
        description = self._og_search_description(webpage)
        thumbnail = self._og_search_thumbnail(webpage)
        video_uploader = self._html_search_regex(
            r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
            webpage, 'uploader nickname', fatal=False)

        # Pages flagged as restricted are reported as 18+
        restricted = re.search(r'"contentRating":"restricted"', webpage) is not None
        age_limit = 18 if restricted else 0

        return {
            'id': video_id,
            'url': video_url,
            'description': description,
            'uploader': video_uploader,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': video_ext,
            'age_limit': age_limit,
        }
Commit	Line	Data
be3b8fa3 S	1	from __future__ import unicode_literals
be3b8fa3 S	2
38cbc40a	3	import re
38cbc40a PH	4
	5	from .common import InfoExtractor
	6	from ..utils import (
38cbc40a	7	compat_parse_qs,
38cbc40a PH	8	compat_urllib_parse,
38cbc40a PH	9	compat_urllib_request,
896d5b63	10	determine_ext,
38cbc40a PH	11	ExtractorError,
	12	)
	13
38cbc40a	14
be3b8fa3	15	class MetacafeIE(InfoExtractor):
401983c6	16	_VALID_URL = r'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
38cbc40a PH	17	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
38cbc40a PH	18	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
be3b8fa3	19	IE_NAME = 'metacafe'
66cf3ac3	20	_TESTS = [
be3b8fa3 S	21	# Youtube video
	22	{
	23	'add_ie': ['Youtube'],
	24	'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
	25	'info_dict': {
	26	'id': '_aUehQsCQtM',
	27	'ext': 'mp4',
	28	'upload_date': '20090102',
401983c6	29	'title': 'The Electric Company | "Short I" | PBS KIDS GO!',
be3b8fa3 S	30	'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8',
	31	'uploader': 'PBS',
	32	'uploader_id': 'PBS'
	33	}
66cf3ac3	34	},
be3b8fa3 S	35	# Normal metacafe video
	36	{
	37	'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
	38	'md5': '6e0bca200eaad2552e6915ed6fd4d9ad',
	39	'info_dict': {
	40	'id': '11121940',
	41	'ext': 'mp4',
	42	'title': 'News: Stuff You Won\'t Do with Your PlayStation 4',
	43	'uploader': 'ign',
	44	'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
	45	},
66cf3ac3	46	},
be3b8fa3 S	47	# AnyClip video
	48	{
	49	'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/',
	50	'info_dict': {
	51	'id': 'an-dVVXnuY7Jh77J',
	52	'ext': 'mp4',
	53	'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
	54	'uploader': 'anyclip',
	55	'description': 'md5:38c711dd98f5bb87acf973d573442e67',
	56	},
66cf3ac3	57	},
be3b8fa3 S	58	# age-restricted video
	59	{
	60	'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
	61	'md5': '98dde7c1a35d02178e8ab7560fe8bd09',
	62	'info_dict': {
	63	'id': '5186653',
	64	'ext': 'mp4',
	65	'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
	66	'uploader': 'Dwayne Pipe',
	67	'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b',
	68	'age_limit': 18,
	69	},
b9a2c538	70	},
be3b8fa3 S	71	# cbs video
be3b8fa3 S	72	{
391dc3ee	73	'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/',
be3b8fa3	74	'info_dict': {
391dc3ee	75	'id': '8VD4r_Zws8VP',
be3b8fa3	76	'ext': 'flv',
391dc3ee S	77	'title': 'Open: This is Face the Nation, February 9',
	78	'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
	79	'duration': 96,
be3b8fa3 S	80	},
	81	'params': {
	82	# rtmp download
	83	'skip_download': True,
	84	},
b9a2c538	85	},
66cf3ac3	86	]
83f6f68e	87
38cbc40a	88	def report_disclaimer(self):
be3b8fa3	89	self.to_screen('Retrieving disclaimer')
38cbc40a PH	90
	91	def _real_initialize(self):
	92	# Retrieve disclaimer
baa7b197	93	self.report_disclaimer()
be3b8fa3	94	self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')
38cbc40a PH	95
	96	# Confirm age
	97	disclaimer_form = {
	98	'filters': '0',
	99	'submit': "Continue - I'm over 18",
be3b8fa3	100	}
38cbc40a	101	request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
66cf3ac3	102	request.add_header('Content-Type', 'application/x-www-form-urlencoded')
baa7b197	103	self.report_age_confirmation()
be3b8fa3	104	self._download_webpage(request, None, False, 'Unable to confirm age')
2c139607	105
38cbc40a PH	106	def _real_extract(self, url):
	107	# Extract id and simplified title from URL
	108	mobj = re.match(self._VALID_URL, url)
	109	if mobj is None:
be3b8fa3	110	raise ExtractorError('Invalid URL: %s' % url)
38cbc40a PH	111
	112	video_id = mobj.group(1)
	113
b9a2c538 JMF	114	# the video may come from an external site
	115	m_external = re.match('^(\w{2})-(.*)$', video_id)
	116	if m_external is not None:
	117	prefix, ext_id = m_external.groups()
	118	# Check if video comes from YouTube
	119	if prefix == 'yt':
	120	return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
	121	# CBS videos use theplatform.com
	122	if prefix == 'cb':
	123	return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')
38cbc40a PH	124
38cbc40a PH	125	# Retrieve video webpage to extract further information
896d5b63	126	req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
66cf3ac3 JMF	127
	128	# AnyClip videos require the flashversion cookie so that we get the link
	129	# to the mp4 file
	130	mobj_an = re.match(r'^an-(.*?)$', video_id)
	131	if mobj_an:
	132	req.headers['Cookie'] = 'flashVersion=0;'
896d5b63	133	webpage = self._download_webpage(req, video_id)
38cbc40a PH	134
	135	# Extract URL, uploader and title from webpage
	136	self.report_extraction(video_id)
	137	mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
	138	if mobj is not None:
	139	mediaURL = compat_urllib_parse.unquote(mobj.group(1))
896d5b63	140	video_ext = mediaURL[-3:]
38cbc40a PH	141
	142	# Extract gdaKey if available
	143	mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
	144	if mobj is None:
	145	video_url = mediaURL
	146	else:
	147	gdaKey = mobj.group(1)
	148	video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
	149	else:
896d5b63 PH	150	mobj = re.search(r'<video src="([^"]+)"', webpage)
	151	if mobj:
	152	video_url = mobj.group(1)
	153	video_ext = 'mp4'
	154	else:
12c3ec33 PH	155	flashvars = self._search_regex(
	156	r' name="flashvars" value="(.*?)"', webpage, 'flashvars')
	157	vardict = compat_parse_qs(flashvars)
896d5b63	158	if 'mediaData' not in vardict:
be3b8fa3 S	159	raise ExtractorError('Unable to extract media URL')
	160	mobj = re.search(
	161	r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
896d5b63	162	if mobj is None:
be3b8fa3	163	raise ExtractorError('Unable to extract media URL')
896d5b63 PH	164	mediaURL = mobj.group('mediaURL').replace('\\/', '/')
	165	video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
	166	video_ext = determine_ext(video_url)
38cbc40a	167
12c3ec33 PH	168	video_title = self._html_search_regex(
12c3ec33 PH	169	r'(?im)<title>(.*) - Video</title>', webpage, 'title')
7e24b09d	170	description = self._og_search_description(webpage)
54ab1939	171	thumbnail = self._og_search_thumbnail(webpage)
f085f960	172	video_uploader = self._html_search_regex(
6c758d79	173	r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
be3b8fa3	174	webpage, 'uploader nickname', fatal=False)
38cbc40a	175
66cf3ac3 JMF	176	if re.search(r'"contentRating":"restricted"', webpage) is not None:
	177	age_limit = 18
	178	else:
	179	age_limit = 0
	180
5910724b	181	return {
be3b8fa3 S	182	'id': video_id,
be3b8fa3 S	183	'url': video_url,
7e24b09d	184	'description': description,
896d5b63	185	'uploader': video_uploader,
be3b8fa3	186	'title': video_title,
12c3ec33	187	'thumbnail': thumbnail,
be3b8fa3	188	'ext': video_ext,
66cf3ac3	189	'age_limit': age_limit,
5910724b	190	}