[yt-dlp.git] / youtube_dl / extractor / facebook.py

import json
import re
import socket

from .common import InfoExtractor
from ..utils import (
    compat_http_client,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,

    ExtractorError,
)


class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Handles ``video/video.php`` and ``photo.php`` URLs that carry a numeric
    ``v`` parameter. When credentials are configured, a login is attempted
    during initialization; a rejected login or a network error while posting
    the forms is reported as a warning instead of being raised.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:[^#?]*#!/)?(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
    _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'
    _TEST = {
        u'url': u'https://www.facebook.com/photo.php?v=120708114770723',
        u'file': u'120708114770723.mp4',
        u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
        u'info_dict': {
            u"duration": 279,
            u"title": u"PEOPLE ARE AWESOME 2013"
        }
    }

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    @staticmethod
    def _encode_form(form):
        """Return the mapping *form* urlencoded as an ASCII byte string.

        Text values are encoded as UTF-8 first so that non-ASCII field
        values cannot make ``urlencode`` fail on Python 2, and the result is
        converted to bytes because Python 3 only accepts bytes as POST data.
        """
        encoded = dict(
            (key, value.encode('utf-8') if isinstance(value, compat_str) else value)
            for key, value in form.items())
        # urlencode output is percent-escaped, hence always pure ASCII.
        return compat_urllib_parse.urlencode(encoded).encode('ascii')

    def _login(self):
        """Try to log in with the configured credentials.

        Returns immediately when no username is configured. A response that
        still contains the login form, an unconfirmed checkpoint, or a
        network error while posting the forms results in a warning only.
        """
        (useremail, password) = self._get_login_info()
        if useremail is None:
            return

        login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
        login_page_req.add_header('Cookie', 'locale=en_US')
        self.report_login()
        login_page = self._download_webpage(login_page_req, None, note=False,
            errnote=u'Unable to download login page')
        # Hidden form values taken from the login page and posted back with
        # the credentials.
        lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd')
        lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd')

        login_form = {
            'email': useremail,
            'pass': password,
            'lsd': lsd,
            'lgnrnd': lgnrnd,
            'next': 'http://facebook.com/home.php',
            'default_persistent': '0',
            'legacy_return': '1',
            'timezone': '-60',
            'trynum': '1',
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, self._encode_form(login_form))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        try:
            # Decode the body: the patterns below are text patterns and
            # cannot be applied to the raw bytes returned by read().
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # The login form being served again means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return

            # NOTE(review): _search_regex presumably raises when a field is
            # missing, and that error is not caught below - confirm intended.
            check_form = {
                'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'),
                'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'),
                'name_action_selected': 'dont_save',
                'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'),
            }
            check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, self._encode_form(check_form))
            check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
            check_response = compat_urllib_request.urlopen(check_req).read().decode('utf-8', 'replace')
            if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
                self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_initialize(self):
        """Attempt the login before any extraction takes place."""
        self._login()

    def _real_extract(self, url):
        """Extract the video information for *url*.

        Returns a one-element list holding a dict with the keys ``id``,
        ``title``, ``url``, ``ext``, ``duration`` and ``thumbnail``.

        Raises ExtractorError when the URL does not match, when the embedded
        player data cannot be located (quoting Facebook's interstitial
        message if one is present), or when neither an HD nor an SD source
        URL is available.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Always fetch the canonical video page, whichever URL form was given.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are a JSON literal sitting between these two
        # JavaScript fragments in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
            if m_msg is not None:
                raise ExtractorError(
                    u'The video is not available, Facebook said: "%s"' % m_msg.group(1),
                    expected=True)
            else:
                raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is itself a URL-quoted JSON document.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source and fall back to SD; .get() keeps a missing
        # key from raising KeyError ahead of the explicit error below.
        video_url = video_data.get('hd_src') or video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex(
            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
Commit	Line	Data
9eae41dd	1	import json
9eae41dd PH	2	import re
	3	import socket
	4
	5	from .common import InfoExtractor
	6	from ..utils import (
	7	compat_http_client,
	8	compat_str,
	9	compat_urllib_error,
	10	compat_urllib_parse,
	11	compat_urllib_request,
	12
	13	ExtractorError,
	14	)
	15
	16
	17	class FacebookIE(InfoExtractor):
	18	"""Information Extractor for Facebook"""
	19
8c8e3eec	20	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:[^#?]*#!/)?(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
67874aef JMF	21	_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
67874aef JMF	22	_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
9eae41dd PH	23	_NETRC_MACHINE = 'facebook'
9eae41dd PH	24	IE_NAME = u'facebook'
6f5ac90c PH	25	_TEST = {
	26	u'url': u'https://www.facebook.com/photo.php?v=120708114770723',
	27	u'file': u'120708114770723.mp4',
	28	u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
	29	u'info_dict': {
8c8e3eec	30	u"duration": 279,
6f5ac90c PH	31	u"title": u"PEOPLE ARE AWESOME 2013"
	32	}
	33	}
9eae41dd PH	34
	35	def report_login(self):
	36	"""Report attempt to log in."""
	37	self.to_screen(u'Logging in')
	38
67874aef JMF	39	def _login(self):
67874aef JMF	40	(useremail, password) = self._get_login_info()
9eae41dd PH	41	if useremail is None:
	42	return
	43
67874aef JMF	44	login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
	45	login_page_req.add_header('Cookie', 'locale=en_US')
	46	self.report_login()
	47	login_page = self._download_webpage(login_page_req, None, note=False,
	48	errnote=u'Unable to download login page')
	49	lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd')
	50	lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd')
	51
9eae41dd PH	52	login_form = {
	53	'email': useremail,
	54	'pass': password,
67874aef JMF	55	'lsd': lsd,
	56	'lgnrnd': lgnrnd,
	57	'next': 'http://facebook.com/home.php',
	58	'default_persistent': '0',
	59	'legacy_return': '1',
	60	'timezone': '-60',
	61	'trynum': '1',
9eae41dd PH	62	}
9eae41dd PH	63	request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
67874aef	64	request.add_header('Content-Type', 'application/x-www-form-urlencoded')
9eae41dd	65	try:
9eae41dd PH	66	login_results = compat_urllib_request.urlopen(request).read()
	67	if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
	68	self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
	69	return
67874aef JMF	70
	71	check_form = {
	72	'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'),
	73	'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'),
	74	'name_action_selected': 'dont_save',
	75	'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'),
	76	}
	77	check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form))
	78	check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
	79	check_response = compat_urllib_request.urlopen(check_req).read()
	80	if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
	81	self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.')
9eae41dd PH	82	except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
	83	self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
	84	return
	85
67874aef JMF	86	def _real_initialize(self):
	87	self._login()
	88
9eae41dd PH	89	def _real_extract(self, url):
	90	mobj = re.match(self._VALID_URL, url)
	91	if mobj is None:
	92	raise ExtractorError(u'Invalid URL: %s' % url)
	93	video_id = mobj.group('ID')
	94
	95	url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
	96	webpage = self._download_webpage(url, video_id)
	97
	98	BEFORE = '{swf.addParam(param[0], param[1]);});\n'
	99	AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
	100	m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
	101	if not m:
6f71ef58 JMF	102	m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
6f71ef58 JMF	103	if m_msg is not None:
749a4fd2 JMF	104	raise ExtractorError(
	105	u'The video is not available, Facebook said: "%s"' % m_msg.group(1),
	106	expected=True)
6f71ef58	107	else:
749a4fd2	108	raise ExtractorError(u'Cannot parse data')
9eae41dd PH	109	data = dict(json.loads(m.group(1)))
	110	params_raw = compat_urllib_parse.unquote(data['params'])
	111	params = json.loads(params_raw)
	112	video_data = params['video_data'][0]
	113	video_url = video_data.get('hd_src')
	114	if not video_url:
	115	video_url = video_data['sd_src']
	116	if not video_url:
	117	raise ExtractorError(u'Cannot find video URL')
	118	video_duration = int(video_data['video_duration'])
	119	thumbnail = video_data['thumbnail_src']
	120
81ec7c79 PH	121	video_title = self._html_search_regex(
81ec7c79 PH	122	r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title')
9eae41dd PH	123
	124	info = {
	125	'id': video_id,
	126	'title': video_title,
	127	'url': video_url,
	128	'ext': 'mp4',
	129	'duration': video_duration,
	130	'thumbnail': thumbnail,
	131	}
	132	return [info]