[yt-dlp.git] / youtube_dl / extractor / pornhub.py

from __future__ import unicode_literals

import os
import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
)
from ..utils import (
    ExtractorError,
    str_to_int,
)
from ..aes import (
    aes_decrypt_text
)


class PornHubIE(InfoExtractor):
    """Extractor for single PornHub videos (watch pages and embed URLs)."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '882f488fa1f0026f023f33576004a2ed',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            "uploader": "Babes",
            "title": "Seductive Indian beauty strips down and fingers her pink pussy",
            "age_limit": 18
        }
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }]

    @classmethod
    def _extract_url(cls, webpage):
        """Return the URL of the first PornHub embed iframe found in webpage.

        Returns None when the page contains no such iframe.
        """
        # Video ids may be alphanumeric (see the second entry in _TESTS), so
        # the id part accepts the same character class as _VALID_URL.
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[0-9a-z]+)\1', webpage)
        if mobj:
            return mobj.group('url')

    def _extract_count(self, pattern, webpage, name):
        """Search webpage for pattern and pass the match through str_to_int.

        The search is non-fatal; name is only used to label the field as
        '<name> count' in the search call.
        """
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    def _real_extract(self, url):
        """Download the watch page for url and build the video info dict.

        Raises ExtractorError (expected=True) when the page contains a
        "userMessageSection" block, reporting that block's text.
        """
        video_id = self._match_id(url)

        # Always fetch the canonical watch page (also for embed URLs) and
        # send the age verification cookie with the request.
        req = compat_urllib_request.Request(
            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        error_msg = self._html_search_regex(
            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
            webpage, 'error message', default=None)
        if error_msg:
            # Collapse runs of whitespace so the message fits on one line.
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)
        thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = compat_urllib_parse.unquote(thumbnail)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        video_urls = [
            compat_urllib_parse.unquote(quoted_url)
            for quoted_url in re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)]
        if '"encrypted":true' in webpage:
            # The page flags its media URLs as encrypted; they are decrypted
            # with the URL-unquoted "video_title" value as the password.
            password = compat_urllib_parse.unquote_plus(
                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
            video_urls = [
                aes_decrypt_text(encrypted_url, password, 32).decode('utf-8')
                for encrypted_url in video_urls]

        formats = []
        for video_url in video_urls:
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]

            # The format label is built from the first two '_'-separated
            # parts of the sixth path segment. A shorter path must not abort
            # the extraction, so the label is simply left unset in that case.
            segments = path.split('/')
            if len(segments) > 5:
                format_id = '-'.join(segments[5].split('_')[:2])
            else:
                format_id = None

            height = None
            tbr = None
            if format_id:
                m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format_id)
                if m is not None:
                    height = int(m.group('height'))
                    tbr = int(m.group('tbr'))

            formats.append({
                'url': video_url,
                'ext': extension,
                'format': format_id,
                'format_id': format_id,
                'tbr': tbr,
                'height': height,
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'title': video_title,
            'thumbnail': thumbnail,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
        }


class PornHubPlaylistIE(InfoExtractor):
    """Extractor for PornHub playlist pages."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/6201671',
        'info_dict': {
            'id': '6201671',
            'title': 'P0p4',
        },
        'playlist_mincount': 35,
    }]

    def _real_extract(self, url):
        """Download the playlist page and return a playlist result.

        Each distinct video link on the page becomes one url_result entry
        handled by the 'PornHub' extractor; title and description are read
        from the page's "playlistObject" JSON.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        # Viewkeys may be alphanumeric (e.g. "ph557bbb6676d2d"), not only
        # digits. Duplicates are dropped while keeping the order in which
        # the links appear on the page, so the result is deterministic.
        entries = []
        seen = set()
        for video_url in re.findall(
                r'href="/?(view_video\.php\?viewkey=[0-9a-z]+[^"]*)"', webpage):
            if video_url in seen:
                continue
            seen.add(video_url)
            entries.append(self.url_result(
                'http://www.pornhub.com/%s' % video_url, 'PornHub'))

        playlist = self._parse_json(
            self._search_regex(
                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
            playlist_id)

        return self.playlist_result(
            entries, playlist_id, playlist.get('title'), playlist.get('description'))
Commit	Line	Data
9933b574 PH	1	from __future__ import unicode_literals
9933b574 PH	2
125cfd78	3	import os
	4	import re
	5
	6	from .common import InfoExtractor
1cc79574 PH	7	from ..compat import (
1cc79574 PH	8	compat_urllib_parse,
125cfd78	9	compat_urllib_parse_urlparse,
125cfd78	10	compat_urllib_request,
1cc79574 PH	11	)
1cc79574 PH	12	from ..utils import (
50789175	13	ExtractorError,
0320ddc1	14	str_to_int,
125cfd78	15	)
	16	from ..aes import (
	17	aes_decrypt_text
	18	)
	19
9933b574	20
125cfd78	21	class PornHubIE(InfoExtractor):
360075e2 S	22	_VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=\|embed/)(?P<id>[0-9a-z]+)'
360075e2 S	23	_TESTS = [{
9933b574	24	'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
9933b574 PH	25	'md5': '882f488fa1f0026f023f33576004a2ed',
9933b574 PH	26	'info_dict': {
249efaf4 PH	27	'id': '648719015',
	28	'ext': 'mp4',
	29	"uploader": "Babes",
9933b574 PH	30	"title": "Seductive Indian beauty strips down and fingers her pink pussy",
9933b574 PH	31	"age_limit": 18
125cfd78	32	}
360075e2 S	33	}, {
	34	'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
	35	'only_matching': True,
	36	}]
125cfd78	37
65d161c4 S	38	@classmethod
	39	def _extract_url(cls, webpage):
	40	mobj = re.search(
	41	r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
	42	if mobj:
	43	return mobj.group('url')
	44
0320ddc1	45	def _extract_count(self, pattern, webpage, name):
7700207e S	46	return str_to_int(self._search_regex(
7700207e S	47	pattern, webpage, '%s count' % name, fatal=False))
0320ddc1	48
125cfd78	49	def _real_extract(self, url):
249efaf4	50	video_id = self._match_id(url)
125cfd78	51
9fcbd5db S	52	req = compat_urllib_request.Request(
9fcbd5db S	53	'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
125cfd78	54	req.add_header('Cookie', 'age_verified=1')
	55	webpage = self._download_webpage(req, video_id)
	56
50789175 PH	57	error_msg = self._html_search_regex(
	58	r'(?s)<div class="userMessageSection[^"]".?>(.*?)</div>',
	59	webpage, 'error message', default=None)
	60	if error_msg:
	61	error_msg = re.sub(r'\s+', ' ', error_msg)
	62	raise ExtractorError(
	63	'PornHub said: %s' % error_msg,
	64	expected=True, video_id=video_id)
	65
9933b574	66	video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
0320ddc1	67	video_uploader = self._html_search_regex(
8fc642eb	68	r'(?s)From: .+?<(?:a href="/users/\|a href="/channels/\|span class="username)[^>]+>(.+?)<',
0320ddc1	69	webpage, 'uploader', fatal=False)
9933b574	70	thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
125cfd78	71	if thumbnail:
	72	thumbnail = compat_urllib_parse.unquote(thumbnail)
	73
7700207e S	74	view_count = self._extract_count(
	75	r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
	76	like_count = self._extract_count(
	77	r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
	78	dislike_count = self._extract_count(
	79	r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
0320ddc1	80	comment_count = self._extract_count(
7700207e	81	r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
0320ddc1	82
5f6a1245	83	video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
125cfd78	84	if webpage.find('"encrypted":true') != -1:
7a372b64 S	85	password = compat_urllib_parse.unquote_plus(
7a372b64 S	86	self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
125cfd78	87	video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
	88
	89	formats = []
	90	for video_url in video_urls:
a56f9de1 JMF	91	path = compat_urllib_parse_urlparse(video_url).path
a56f9de1 JMF	92	extension = os.path.splitext(path)[1][1:]
125cfd78	93	format = path.split('/')[5].split('_')[:2]
a56f9de1	94	format = "-".join(format)
9933b574 PH	95
	96	m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format)
	97	if m is None:
	98	height = None
	99	tbr = None
	100	else:
	101	height = int(m.group('height'))
	102	tbr = int(m.group('tbr'))
	103
125cfd78	104	formats.append({
	105	'url': video_url,
	106	'ext': extension,
	107	'format': format,
	108	'format_id': format,
9933b574 PH	109	'tbr': tbr,
9933b574 PH	110	'height': height,
125cfd78	111	})
9933b574	112	self._sort_formats(formats)
125cfd78	113
	114	return {
	115	'id': video_id,
	116	'uploader': video_uploader,
	117	'title': video_title,
	118	'thumbnail': thumbnail,
0320ddc1 S	119	'view_count': view_count,
	120	'like_count': like_count,
	121	'dislike_count': dislike_count,
	122	'comment_count': comment_count,
125cfd78	123	'formats': formats,
750e9833	124	'age_limit': 18,
125cfd78	125	}
e66e1a00 S	126
	127
	128	class PornHubPlaylistIE(InfoExtractor):
	129	_VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
	130	_TESTS = [{
	131	'url': 'http://www.pornhub.com/playlist/6201671',
	132	'info_dict': {
	133	'id': '6201671',
	134	'title': 'P0p4',
	135	},
	136	'playlist_mincount': 35,
	137	}]
	138
	139	def _real_extract(self, url):
	140	playlist_id = self._match_id(url)
	141
	142	webpage = self._download_webpage(url, playlist_id)
	143
	144	entries = [
	145	self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub')
	146	for video_url in set(re.findall('href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage))
	147	]
	148
	149	playlist = self._parse_json(
	150	self._search_regex(
	151	r'playlistObject\s=\s({.+?});', webpage, 'playlist'),
	152	playlist_id)
	153
	154	return self.playlist_result(
	155	entries, playlist_id, playlist.get('title'), playlist.get('description'))