[yt-dlp.git] / youtube_dl / extractor / pornhub.py

from __future__ import unicode_literals

import os
import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_unquote,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
)
from ..utils import (
    ExtractorError,
    str_to_int,
)
from ..aes import (
    aes_decrypt_text
)


class PornHubIE(InfoExtractor):
    """Extractor for single PornHub video pages and ``/embed/`` URLs."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '882f488fa1f0026f023f33576004a2ed',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            "uploader": "Babes",
            "title": "Seductive Indian beauty strips down and fingers her pink pussy",
            "age_limit": 18
        }
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }]

    @classmethod
    def _extract_url(cls, webpage):
        """Return the URL of the first PornHub embed iframe in *webpage*.

        Returns None when no such iframe is present.
        """
        # The id part accepts the same alphabet as _VALID_URL ([0-9a-z]) so
        # that embeds using alphanumeric view keys (e.g. "ph557bbb6676d2d")
        # are recognised, not only purely numeric ones.
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)\1', webpage)
        if mobj:
            return mobj.group('url')
        return None

    def _extract_count(self, pattern, webpage, name):
        """Search *webpage* for *pattern* and pass the match to str_to_int.

        The search is non-fatal; *name* is only used to label the field as
        '<name> count'.
        """
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    @staticmethod
    def _format_from_url(video_url):
        """Build the format dict describing a single media *video_url*.

        The format id is made of the first two '_'-separated fields of the
        path component at index 5, joined with '-'. When it has the form
        '<height>P-<tbr>K', height and bitrate are parsed out of it;
        otherwise both are None.
        """
        path = compat_urllib_parse_urlparse(video_url).path
        extension = os.path.splitext(path)[1][1:]

        # A URL with fewer path components than expected must not abort the
        # whole extraction with an IndexError; the format id is simply left
        # unset in that case.
        segments = path.split('/')
        format_id = '-'.join(segments[5].split('_')[:2]) if len(segments) > 5 else None

        height = None
        tbr = None
        if format_id:
            m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format_id)
            if m is not None:
                height = int(m.group('height'))
                tbr = int(m.group('tbr'))

        return {
            'url': video_url,
            'ext': extension,
            'format': format_id,
            'format_id': format_id,
            'tbr': tbr,
            'height': height,
        }

    def _real_extract(self, url):
        """Download the video page for *url* and return its info dict.

        Raises ExtractorError (expected=True) when the page contains a
        'userMessageSection' block, using its text as the message.
        """
        video_id = self._match_id(url)

        # Always fetch the canonical view_video.php page (also for embed
        # URLs), sending the age_verified cookie with the request.
        req = compat_urllib_request.Request(
            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        error_msg = self._html_search_regex(
            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
            webpage, 'error message', default=None)
        if error_msg:
            # Collapse whitespace runs so the message fits on one line.
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)
        thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = compat_urllib_parse_unquote(thumbnail)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        video_urls = [
            compat_urllib_parse_unquote(quoted_url)
            for quoted_url in re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)]
        if '"encrypted":true' in webpage:
            # When the page flags the URLs as encrypted, they are decrypted
            # with the URL-decoded "video_title" value as the password.
            password = compat_urllib_parse_unquote_plus(
                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
            video_urls = [
                aes_decrypt_text(encrypted_url, password, 32).decode('utf-8')
                for encrypted_url in video_urls]

        formats = [self._format_from_url(video_url) for video_url in video_urls]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'title': video_title,
            'thumbnail': thumbnail,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
        }


class PornHubPlaylistIE(InfoExtractor):
    """Extractor for PornHub playlist pages (``/playlist/<numeric id>``)."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/6201671',
        'info_dict': {
            'id': '6201671',
            'title': 'P0p4',
        },
        'playlist_mincount': 35,
    }]

    def _real_extract(self, url):
        """Download the playlist page and return a playlist result.

        Entries are the distinct view_video.php links found in the page, in
        order of first appearance. Title and description are read from the
        'playlistObject' JSON embedded in the page.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        # Raw string: '\.', '\?' and '\d' are not valid string escapes.
        # View keys may be alphanumeric (e.g. "ph557bbb6676d2d"), so digits
        # alone are not enough to match every video link.
        video_paths = re.findall(
            r'href="/?(view_video\.php\?viewkey=[\da-z]+[^"]*)"', webpage)

        # Drop duplicate links while keeping the order in which the videos
        # appear on the page (a plain set() would make the order arbitrary).
        seen = set()
        entries = []
        for video_path in video_paths:
            if video_path in seen:
                continue
            seen.add(video_path)
            entries.append(self.url_result(
                'http://www.pornhub.com/%s' % video_path, 'PornHub'))

        playlist = self._parse_json(
            self._search_regex(
                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
            playlist_id)

        return self.playlist_result(
            entries, playlist_id, playlist.get('title'), playlist.get('description'))
Commit	Line	Data
9933b574 PH	1	from __future__ import unicode_literals
9933b574 PH	2
125cfd78	3	import os
	4	import re
	5
	6	from .common import InfoExtractor
1cc79574	7	from ..compat import (
605cbef6 S	8	compat_urllib_parse_unquote,
605cbef6 S	9	compat_urllib_parse_unquote_plus,
125cfd78	10	compat_urllib_parse_urlparse,
125cfd78	11	compat_urllib_request,
1cc79574 PH	12	)
1cc79574 PH	13	from ..utils import (
50789175	14	ExtractorError,
0320ddc1	15	str_to_int,
125cfd78	16	)
	17	from ..aes import (
	18	aes_decrypt_text
	19	)
	20
9933b574	21
125cfd78	22	class PornHubIE(InfoExtractor):
360075e2 S	23	_VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=\|embed/)(?P<id>[0-9a-z]+)'
360075e2 S	24	_TESTS = [{
9933b574	25	'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
9933b574 PH	26	'md5': '882f488fa1f0026f023f33576004a2ed',
9933b574 PH	27	'info_dict': {
249efaf4 PH	28	'id': '648719015',
	29	'ext': 'mp4',
	30	"uploader": "Babes",
9933b574 PH	31	"title": "Seductive Indian beauty strips down and fingers her pink pussy",
9933b574 PH	32	"age_limit": 18
125cfd78	33	}
360075e2 S	34	}, {
	35	'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
	36	'only_matching': True,
	37	}]
125cfd78	38
65d161c4 S	39	@classmethod
	40	def _extract_url(cls, webpage):
	41	mobj = re.search(
	42	r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
	43	if mobj:
	44	return mobj.group('url')
	45
0320ddc1	46	def _extract_count(self, pattern, webpage, name):
7700207e S	47	return str_to_int(self._search_regex(
7700207e S	48	pattern, webpage, '%s count' % name, fatal=False))
0320ddc1	49
125cfd78	50	def _real_extract(self, url):
249efaf4	51	video_id = self._match_id(url)
125cfd78	52
9fcbd5db S	53	req = compat_urllib_request.Request(
9fcbd5db S	54	'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
125cfd78	55	req.add_header('Cookie', 'age_verified=1')
	56	webpage = self._download_webpage(req, video_id)
	57
50789175 PH	58	error_msg = self._html_search_regex(
	59	r'(?s)<div class="userMessageSection[^"]".?>(.*?)</div>',
	60	webpage, 'error message', default=None)
	61	if error_msg:
	62	error_msg = re.sub(r'\s+', ' ', error_msg)
	63	raise ExtractorError(
	64	'PornHub said: %s' % error_msg,
	65	expected=True, video_id=video_id)
	66
9933b574	67	video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
0320ddc1	68	video_uploader = self._html_search_regex(
8fc642eb	69	r'(?s)From: .+?<(?:a href="/users/\|a href="/channels/\|span class="username)[^>]+>(.+?)<',
0320ddc1	70	webpage, 'uploader', fatal=False)
9933b574	71	thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
125cfd78	72	if thumbnail:
605cbef6	73	thumbnail = compat_urllib_parse_unquote(thumbnail)
125cfd78	74
7700207e S	75	view_count = self._extract_count(
	76	r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
	77	like_count = self._extract_count(
	78	r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
	79	dislike_count = self._extract_count(
	80	r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
0320ddc1	81	comment_count = self._extract_count(
7700207e	82	r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
0320ddc1	83
605cbef6	84	video_urls = list(map(compat_urllib_parse_unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
125cfd78	85	if webpage.find('"encrypted":true') != -1:
605cbef6	86	password = compat_urllib_parse_unquote_plus(
7a372b64	87	self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
125cfd78	88	video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
	89
	90	formats = []
	91	for video_url in video_urls:
a56f9de1 JMF	92	path = compat_urllib_parse_urlparse(video_url).path
a56f9de1 JMF	93	extension = os.path.splitext(path)[1][1:]
125cfd78	94	format = path.split('/')[5].split('_')[:2]
a56f9de1	95	format = "-".join(format)
9933b574 PH	96
	97	m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format)
	98	if m is None:
	99	height = None
	100	tbr = None
	101	else:
	102	height = int(m.group('height'))
	103	tbr = int(m.group('tbr'))
	104
125cfd78	105	formats.append({
	106	'url': video_url,
	107	'ext': extension,
	108	'format': format,
	109	'format_id': format,
9933b574 PH	110	'tbr': tbr,
9933b574 PH	111	'height': height,
125cfd78	112	})
9933b574	113	self._sort_formats(formats)
125cfd78	114
	115	return {
	116	'id': video_id,
	117	'uploader': video_uploader,
	118	'title': video_title,
	119	'thumbnail': thumbnail,
0320ddc1 S	120	'view_count': view_count,
	121	'like_count': like_count,
	122	'dislike_count': dislike_count,
	123	'comment_count': comment_count,
125cfd78	124	'formats': formats,
750e9833	125	'age_limit': 18,
125cfd78	126	}
e66e1a00 S	127
	128
	129	class PornHubPlaylistIE(InfoExtractor):
	130	_VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
	131	_TESTS = [{
	132	'url': 'http://www.pornhub.com/playlist/6201671',
	133	'info_dict': {
	134	'id': '6201671',
	135	'title': 'P0p4',
	136	},
	137	'playlist_mincount': 35,
	138	}]
	139
	140	def _real_extract(self, url):
	141	playlist_id = self._match_id(url)
	142
	143	webpage = self._download_webpage(url, playlist_id)
	144
	145	entries = [
	146	self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub')
	147	for video_url in set(re.findall('href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage))
	148	]
	149
	150	playlist = self._parse_json(
	151	self._search_regex(
	152	r'playlistObject\s=\s({.+?});', webpage, 'playlist'),
	153	playlist_id)
	154
	155	return self.playlist_result(
	156	entries, playlist_id, playlist.get('title'), playlist.get('description'))