[yt-dlp.git] / youtube_dl / extractor / pornhub.py

from __future__ import unicode_literals

import os
import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
)
from ..utils import (
    ExtractorError,
    str_to_int,
)
from ..aes import (
    aes_decrypt_text
)


class PornHubIE(InfoExtractor):
    """Extractor for single PornHub videos (``view_video.php`` and ``embed/`` URLs)."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-f]+)'
    _TEST = {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '882f488fa1f0026f023f33576004a2ed',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            "uploader": "Babes",
            "title": "Seductive Indian beauty strips down and fingers her pink pussy",
            "age_limit": 18
        }
    }

    @classmethod
    def _extract_url(cls, webpage):
        """Return the first PornHub embed URL used as an ``<iframe>`` src in
        *webpage*, or None when there is none.

        The id part accepts the same character set as ``_VALID_URL`` so that
        every URL returned here is also accepted by this extractor.
        """
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[0-9a-f]+)\1', webpage)
        if mobj:
            return mobj.group('url')
        return None

    def _extract_count(self, pattern, webpage, name):
        """Search *webpage* for *pattern* (non-fatally) and convert the
        captured text with ``str_to_int``; *name* is only used to label the
        field as '<name> count'."""
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    @staticmethod
    def _parse_format(path):
        """Derive ``(format_id, height, tbr)`` from the path of a media URL.

        The format id is built from the first two '_'-separated pieces of the
        sixth '/'-separated path component, joined with '-'.  Height and
        bitrate are only filled in when that id looks like ``<digits>P-<digits>K``.
        Paths with fewer than six components yield ``(None, None, None)``
        instead of raising IndexError.
        """
        segments = path.split('/')
        if len(segments) <= 5:
            return None, None, None

        format_id = "-".join(segments[5].split('_')[:2])
        m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format_id)
        if m is None:
            return format_id, None, None
        return format_id, int(m.group('height')), int(m.group('tbr'))

    def _real_extract(self, url):
        """Download the video page for *url* and return its info dict.

        Raises ExtractorError (expected) when the page contains a
        ``userMessageSection`` block, using its text as the message.
        """
        video_id = self._match_id(url)

        # Always fetch the canonical view page, even for embed URLs.
        # NOTE(review): the cookie presumably bypasses the age gate - confirm.
        req = compat_urllib_request.Request(
            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        error_msg = self._html_search_regex(
            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
            webpage, 'error message', default=None)
        if error_msg:
            # Collapse whitespace runs so the message fits on one line.
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)
        thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = compat_urllib_parse.unquote(thumbnail)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        video_urls = [
            compat_urllib_parse.unquote(quoted_url)
            for quoted_url in re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)]
        if '"encrypted":true' in webpage:
            # When the page flags the URLs as encrypted, the URL-decoded
            # "video_title" value is used as the AES password.
            password = compat_urllib_parse.unquote_plus(
                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
            video_urls = [
                aes_decrypt_text(encrypted_url, password, 32).decode('utf-8')
                for encrypted_url in video_urls]

        formats = []
        for video_url in video_urls:
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            format_id, height, tbr = self._parse_format(path)

            formats.append({
                'url': video_url,
                'ext': extension,
                'format': format_id,
                'format_id': format_id,
                'tbr': tbr,
                'height': height,
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'title': video_title,
            'thumbnail': thumbnail,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
        }


class PornHubPlaylistIE(InfoExtractor):
    """Extractor for PornHub playlist pages (``/playlist/<numeric id>``)."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/6201671',
        'info_dict': {
            'id': '6201671',
            'title': 'P0p4',
        },
        'playlist_mincount': 35,
    }]

    def _real_extract(self, url):
        """Download the playlist page and return a playlist result.

        Each distinct ``view_video.php?viewkey=...`` link found on the page
        becomes one entry delegated to the 'PornHub' extractor; title and
        description are read from the page's ``playlistObject`` JSON.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        # De-duplicate while keeping the order in which links appear on the
        # page, so the resulting playlist order is deterministic.
        # The pattern is a raw string (no invalid-escape warnings) and accepts
        # hex viewkeys, like the single-video extractor's URL pattern does.
        seen = set()
        entries = []
        for video_url in re.findall(
                r'href="/?(view_video\.php\?viewkey=[0-9a-f]+[^"]*)"', webpage):
            if video_url in seen:
                continue
            seen.add(video_url)
            entries.append(
                self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub'))

        playlist = self._parse_json(
            self._search_regex(
                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
            playlist_id)

        return self.playlist_result(
            entries, playlist_id, playlist.get('title'), playlist.get('description'))
Commit	Line	Data
9933b574 PH	1	from __future__ import unicode_literals
9933b574 PH	2
125cfd78	3	import os
	4	import re
	5
	6	from .common import InfoExtractor
1cc79574 PH	7	from ..compat import (
1cc79574 PH	8	compat_urllib_parse,
125cfd78	9	compat_urllib_parse_urlparse,
125cfd78	10	compat_urllib_request,
1cc79574 PH	11	)
1cc79574 PH	12	from ..utils import (
50789175	13	ExtractorError,
0320ddc1	14	str_to_int,
125cfd78	15	)
	16	from ..aes import (
	17	aes_decrypt_text
	18	)
	19
9933b574	20
125cfd78	21	class PornHubIE(InfoExtractor):
9fcbd5db	22	_VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-f]+)'
125cfd78	23	_TEST = {
9933b574	24	'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
9933b574 PH	25	'md5': '882f488fa1f0026f023f33576004a2ed',
9933b574 PH	26	'info_dict': {
249efaf4 PH	27	'id': '648719015',
	28	'ext': 'mp4',
	29	"uploader": "Babes",
9933b574 PH	30	"title": "Seductive Indian beauty strips down and fingers her pink pussy",
9933b574 PH	31	"age_limit": 18
125cfd78	32	}
	33	}
	34
65d161c4 S	35	@classmethod
	36	def _extract_url(cls, webpage):
	37	mobj = re.search(
	38	r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
	39	if mobj:
	40	return mobj.group('url')
	41
0320ddc1	42	def _extract_count(self, pattern, webpage, name):
7700207e S	43	return str_to_int(self._search_regex(
7700207e S	44	pattern, webpage, '%s count' % name, fatal=False))
0320ddc1	45
125cfd78	46	def _real_extract(self, url):
249efaf4	47	video_id = self._match_id(url)
125cfd78	48
9fcbd5db S	49	req = compat_urllib_request.Request(
9fcbd5db S	50	'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
125cfd78	51	req.add_header('Cookie', 'age_verified=1')
	52	webpage = self._download_webpage(req, video_id)
	53
50789175 PH	54	error_msg = self._html_search_regex(
	55	r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
	56	webpage, 'error message', default=None)
	57	if error_msg:
	58	error_msg = re.sub(r'\s+', ' ', error_msg)
	59	raise ExtractorError(
	60	'PornHub said: %s' % error_msg,
	61	expected=True, video_id=video_id)
	62
9933b574	63	video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
0320ddc1	64	video_uploader = self._html_search_regex(
8fc642eb	65	r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
0320ddc1	66	webpage, 'uploader', fatal=False)
9933b574	67	thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
125cfd78	68	if thumbnail:
	69	thumbnail = compat_urllib_parse.unquote(thumbnail)
	70
7700207e S	71	view_count = self._extract_count(
	72	r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
	73	like_count = self._extract_count(
	74	r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
	75	dislike_count = self._extract_count(
	76	r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
0320ddc1	77	comment_count = self._extract_count(
7700207e	78	r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
0320ddc1	79
5f6a1245	80	video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
125cfd78	81	if webpage.find('"encrypted":true') != -1:
7a372b64 S	82	password = compat_urllib_parse.unquote_plus(
7a372b64 S	83	self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
125cfd78	84	video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
	85
	86	formats = []
	87	for video_url in video_urls:
a56f9de1 JMF	88	path = compat_urllib_parse_urlparse(video_url).path
a56f9de1 JMF	89	extension = os.path.splitext(path)[1][1:]
125cfd78	90	format = path.split('/')[5].split('_')[:2]
a56f9de1	91	format = "-".join(format)
9933b574 PH	92
	93	m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format)
	94	if m is None:
	95	height = None
	96	tbr = None
	97	else:
	98	height = int(m.group('height'))
	99	tbr = int(m.group('tbr'))
	100
125cfd78	101	formats.append({
	102	'url': video_url,
	103	'ext': extension,
	104	'format': format,
	105	'format_id': format,
9933b574 PH	106	'tbr': tbr,
9933b574 PH	107	'height': height,
125cfd78	108	})
9933b574	109	self._sort_formats(formats)
125cfd78	110
	111	return {
	112	'id': video_id,
	113	'uploader': video_uploader,
	114	'title': video_title,
	115	'thumbnail': thumbnail,
0320ddc1 S	116	'view_count': view_count,
	117	'like_count': like_count,
	118	'dislike_count': dislike_count,
	119	'comment_count': comment_count,
125cfd78	120	'formats': formats,
750e9833	121	'age_limit': 18,
125cfd78	122	}
e66e1a00 S	123
	124
	125	class PornHubPlaylistIE(InfoExtractor):
	126	_VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
	127	_TESTS = [{
	128	'url': 'http://www.pornhub.com/playlist/6201671',
	129	'info_dict': {
	130	'id': '6201671',
	131	'title': 'P0p4',
	132	},
	133	'playlist_mincount': 35,
	134	}]
	135
	136	def _real_extract(self, url):
	137	playlist_id = self._match_id(url)
	138
	139	webpage = self._download_webpage(url, playlist_id)
	140
	141	entries = [
	142	self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub')
	143	for video_url in set(re.findall('href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage))
	144	]
	145
	146	playlist = self._parse_json(
	147	self._search_regex(
	148	r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
	149	playlist_id)
	150
	151	return self.playlist_result(
	152	entries, playlist_id, playlist.get('title'), playlist.get('description'))