[yt-dlp.git] / youtube_dl / extractor / pornhub.py

from __future__ import unicode_literals

import os
import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
)
from ..utils import (
    ExtractorError,
    str_to_int,
)
from ..aes import (
    aes_decrypt_text
)


class PornHubIE(InfoExtractor):
    """Extractor for single PornHub video pages (``view_video.php?viewkey=...``)."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
    _TEST = {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '882f488fa1f0026f023f33576004a2ed',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            "uploader": "Babes",
            "title": "Seductive Indian beauty strips down and fingers her pink pussy",
            "age_limit": 18
        }
    }

    def _extract_count(self, pattern, webpage, name):
        """Search *webpage* for *pattern* and pass the captured text to str_to_int.

        The search is non-fatal (``fatal=False``), so a missing counter does
        not abort extraction.  *name* is only used to label the field as
        '<name> count'.
        """
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    @staticmethod
    def _format_label(path):
        """Build the format label from the path component of a media URL.

        The label is the first two '_'-separated parts of the piece at index 5
        of the '/'-split path, joined with '-'.  Returns None when the path
        has too few pieces instead of raising IndexError.

        NOTE(review): the position of the label inside the path is taken from
        the previous implementation; confirm against current media URLs.
        """
        pieces = path.split('/')
        if len(pieces) <= 5:
            return None
        return "-".join(pieces[5].split('_')[:2])

    def _real_extract(self, url):
        """Download the video page and return the info dict for it.

        Raises ExtractorError (expected=True) when the page contains a
        non-empty ``userMessageSection`` div, using its text as the message.
        """
        video_id = self._match_id(url)

        # The page is requested with the age-verification cookie already set.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        error_msg = self._html_search_regex(
            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
            webpage, 'error message', default=None)
        if error_msg:
            # Collapse whitespace runs so the message fits on one line.
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)
        thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = compat_urllib_parse.unquote(thumbnail)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
        if '"encrypted":true' in webpage:
            # When the page flags the URLs as encrypted, they are decrypted
            # with the (unquoted) "video_title" value as the password.
            password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
            video_urls = [
                aes_decrypt_text(encrypted_url, password, 32).decode('utf-8')
                for encrypted_url in video_urls]

        formats = []
        for video_url in video_urls:
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            format_id = self._format_label(path)

            # Labels shaped like '<height>P-<bitrate>K' also yield numeric
            # height/tbr; any other label leaves both as None.
            m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format_id or '')
            if m is None:
                height = None
                tbr = None
            else:
                height = int(m.group('height'))
                tbr = int(m.group('tbr'))

            fmt = {
                'url': video_url,
                'ext': extension,
                'tbr': tbr,
                'height': height,
            }
            if format_id is not None:
                # Only set the label when the URL path actually provided one.
                fmt['format'] = format_id
                fmt['format_id'] = format_id
            formats.append(fmt)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'title': video_title,
            'thumbnail': thumbnail,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
        }


class PornHubPlaylistIE(InfoExtractor):
    """Extractor for PornHub playlist pages (``/playlist/<numeric id>``)."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/6201671',
        'info_dict': {
            'id': '6201671',
            'title': 'P0p4',
        },
        'playlist_mincount': 35,
    }]

    def _real_extract(self, url):
        """Download the playlist page and return a playlist result.

        Entries are the distinct ``view_video.php?viewkey=...`` links found in
        the page, in order of first appearance, each delegated to the
        'PornHub' extractor.  Title and description are read from the
        ``playlistObject`` JSON embedded in the page.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        # De-duplicate while keeping page order (a plain set() would shuffle
        # the playlist).  The viewkey character class mirrors the one used by
        # PornHubIE._VALID_URL so every collected link is one it accepts.
        seen = set()
        entries = []
        for video_url in re.findall(
                r'href="/?(view_video\.php\?viewkey=[0-9a-f]+[^"]*)"', webpage):
            if video_url in seen:
                continue
            seen.add(video_url)
            entries.append(self.url_result(
                'http://www.pornhub.com/%s' % video_url, 'PornHub'))

        playlist = self._parse_json(
            self._search_regex(
                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
            playlist_id)

        return self.playlist_result(
            entries, playlist_id, playlist.get('title'), playlist.get('description'))
Commit	Line	Data
9933b574 PH	1	from __future__ import unicode_literals
9933b574 PH	2
125cfd78	3	import os
	4	import re
	5
	6	from .common import InfoExtractor
1cc79574 PH	7	from ..compat import (
1cc79574 PH	8	compat_urllib_parse,
125cfd78	9	compat_urllib_parse_urlparse,
125cfd78	10	compat_urllib_request,
1cc79574 PH	11	)
1cc79574 PH	12	from ..utils import (
50789175	13	ExtractorError,
0320ddc1	14	str_to_int,
125cfd78	15	)
	16	from ..aes import (
	17	aes_decrypt_text
	18	)
	19
9933b574	20
125cfd78	21	class PornHubIE(InfoExtractor):
1cc79574	22	_VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
125cfd78	23	_TEST = {
9933b574	24	'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
9933b574 PH	25	'md5': '882f488fa1f0026f023f33576004a2ed',
9933b574 PH	26	'info_dict': {
249efaf4 PH	27	'id': '648719015',
	28	'ext': 'mp4',
	29	"uploader": "Babes",
9933b574 PH	30	"title": "Seductive Indian beauty strips down and fingers her pink pussy",
9933b574 PH	31	"age_limit": 18
125cfd78	32	}
	33	}
	34
0320ddc1	35	def _extract_count(self, pattern, webpage, name):
7700207e S	36	return str_to_int(self._search_regex(
7700207e S	37	pattern, webpage, '%s count' % name, fatal=False))
0320ddc1	38
125cfd78	39	def _real_extract(self, url):
249efaf4	40	video_id = self._match_id(url)
125cfd78	41
	42	req = compat_urllib_request.Request(url)
	43	req.add_header('Cookie', 'age_verified=1')
	44	webpage = self._download_webpage(req, video_id)
	45
50789175 PH	46	error_msg = self._html_search_regex(
	47	r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
	48	webpage, 'error message', default=None)
	49	if error_msg:
	50	error_msg = re.sub(r'\s+', ' ', error_msg)
	51	raise ExtractorError(
	52	'PornHub said: %s' % error_msg,
	53	expected=True, video_id=video_id)
	54
9933b574	55	video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
0320ddc1	56	video_uploader = self._html_search_regex(
8fc642eb	57	r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
0320ddc1	58	webpage, 'uploader', fatal=False)
9933b574	59	thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
125cfd78	60	if thumbnail:
	61	thumbnail = compat_urllib_parse.unquote(thumbnail)
	62
7700207e S	63	view_count = self._extract_count(
	64	r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
	65	like_count = self._extract_count(
	66	r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
	67	dislike_count = self._extract_count(
	68	r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
0320ddc1	69	comment_count = self._extract_count(
7700207e	70	r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
0320ddc1	71
5f6a1245	72	video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
125cfd78	73	if webpage.find('"encrypted":true') != -1:
ee95c093	74	password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
125cfd78	75	video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
	76
	77	formats = []
	78	for video_url in video_urls:
a56f9de1 JMF	79	path = compat_urllib_parse_urlparse(video_url).path
a56f9de1 JMF	80	extension = os.path.splitext(path)[1][1:]
125cfd78	81	format = path.split('/')[5].split('_')[:2]
a56f9de1	82	format = "-".join(format)
9933b574 PH	83
	84	m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format)
	85	if m is None:
	86	height = None
	87	tbr = None
	88	else:
	89	height = int(m.group('height'))
	90	tbr = int(m.group('tbr'))
	91
125cfd78	92	formats.append({
	93	'url': video_url,
	94	'ext': extension,
	95	'format': format,
	96	'format_id': format,
9933b574 PH	97	'tbr': tbr,
9933b574 PH	98	'height': height,
125cfd78	99	})
9933b574	100	self._sort_formats(formats)
125cfd78	101
	102	return {
	103	'id': video_id,
	104	'uploader': video_uploader,
	105	'title': video_title,
	106	'thumbnail': thumbnail,
0320ddc1 S	107	'view_count': view_count,
	108	'like_count': like_count,
	109	'dislike_count': dislike_count,
	110	'comment_count': comment_count,
125cfd78	111	'formats': formats,
750e9833	112	'age_limit': 18,
125cfd78	113	}
e66e1a00 S	114
	115
	116	class PornHubPlaylistIE(InfoExtractor):
	117	_VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
	118	_TESTS = [{
	119	'url': 'http://www.pornhub.com/playlist/6201671',
	120	'info_dict': {
	121	'id': '6201671',
	122	'title': 'P0p4',
	123	},
	124	'playlist_mincount': 35,
	125	}]
	126
	127	def _real_extract(self, url):
	128	playlist_id = self._match_id(url)
	129
	130	webpage = self._download_webpage(url, playlist_id)
	131
	132	entries = [
	133	self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub')
	134	for video_url in set(re.findall('href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage))
	135	]
	136
	137	playlist = self._parse_json(
	138	self._search_regex(
	139	r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
	140	playlist_id)
	141
	142	return self.playlist_result(
	143	entries, playlist_id, playlist.get('title'), playlist.get('description'))