[yt-dlp.git] / youtube_dl / extractor / xtube.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import (
    parse_duration,
    sanitized_Request,
    str_to_int,
)


class XTubeIE(InfoExtractor):
    """Extract metadata and media formats from a single xtube.com watch page."""

    _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<id>[^/?&#]+))'
    _TEST = {
        'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
        'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
        'info_dict': {
            'id': 'kVTUy_G222_',
            'ext': 'mp4',
            'title': 'strange erotica',
            'description': 'contains:an ET kind of thing',
            'uploader': 'greenshowers',
            'duration': 450,
            'age_limit': 18,
        }
    }

    @staticmethod
    def _quality_format(quality, encoded_url):
        """Build one format dict from a ``flashvars.quality_*`` assignment.

        ``quality`` is the text following ``quality_`` and becomes the
        ``format_id``; ``encoded_url`` is the quoted value, which is
        URL-unquoted. A ``height`` key is added only when ``quality`` starts
        with digits followed by ``p``/``P``.
        """
        entry = {
            'url': compat_urllib_parse_unquote(encoded_url),
            'format_id': quality,
        }
        height_match = re.search(r'^(?P<height>\d+)[pP]', quality)
        if height_match is not None:
            entry['height'] = int(height_match.group('height'))
        return entry

    def _real_extract(self, url):
        """Download the watch page for ``url`` and return its info dict.

        Only the title and (when no per-quality URLs are present) the
        fallback video URL are looked up as fatal; the remaining fields are
        searched with ``fatal=False``.
        """
        video_id = self._match_id(url)

        # Send the age_verified cookie with the page request
        # (presumably to skip the site's age check -- confirm against site).
        page_request = sanitized_Request(url)
        page_request.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(page_request, video_id)

        title = self._html_search_regex(
            r'<p class="title">([^<]+)', webpage, 'title')

        uploader_patterns = [
            r"var\s+contentOwnerId\s*=\s*'([^']+)",
            r'By:\s*<a href="/community/profile\.php\?user=([^"]+)',
        ]
        uploader = self._html_search_regex(
            uploader_patterns, webpage, 'uploader', fatal=False)

        description = self._html_search_regex(
            r'<p class="fieldsDesc">([^<]+)',
            webpage, 'description', fatal=False)

        runtime_text = self._html_search_regex(
            r'<span class="bold">Runtime:</span> ([^<]+)</p>',
            webpage, 'duration', fatal=False)
        duration = parse_duration(runtime_text)

        views_text = self._html_search_regex(
            r'<span class="bold">Views:</span> ([\d,\.]+)</p>',
            webpage, 'view count', fatal=False)
        view_count = str_to_int(views_text)

        comments_text = self._html_search_regex(
            r'<div id="commentBar">([\d,\.]+) Comments</div>',
            webpage, 'comment count', fatal=False)
        comment_count = str_to_int(comments_text)

        # One format per flashvars.quality_<id> = "<url>" assignment in the page.
        quality_pairs = re.findall(
            r'flashvars\.quality_(.+?)\s*=\s*"([^"]+)"', webpage)
        formats = [
            self._quality_format(quality, encoded_url)
            for quality, encoded_url in quality_pairs]

        if not formats:
            # No per-quality entries: fall back to the single video_url value.
            fallback_url = self._search_regex(
                r'flashvars\.video_url\s*=\s*"([^"]+)"',
                webpage, 'video URL')
            formats.append({'url': compat_urllib_parse_unquote(fallback_url)})

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'uploader': uploader,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
        }


class XTubeUserIE(InfoExtractor):
    """Build a playlist of a user's videos from their xtube.com profile."""

    IE_DESC = 'XTube user profile'
    _VALID_URL = r'https?://(?:www\.)?xtube\.com/community/profile\.php\?(.*?)user=(?P<username>[^&#]+)(?:$|[&#])'
    _TEST = {
        'url': 'http://www.xtube.com/community/profile.php?user=greenshowers',
        'info_dict': {
            'id': 'greenshowers',
            'age_limit': 18,
        },
        'playlist_mincount': 155,
    }

    def _real_extract(self, url):
        """Return a playlist dict whose entries are the user's video URLs.

        The video count is read from the profile page (fatal if missing),
        then every ``user_videos.php`` listing page is downloaded and the
        ``addthis:url`` attribute values found there are collected, in page
        order, as ``url`` entries handled by the ``XTube`` extractor.
        """
        mobj = re.match(self._VALID_URL, url)
        username = mobj.group('username')

        profile_page = self._download_webpage(
            url, username, note='Retrieving profile page')

        # The username comes straight from the URL, so escape it before
        # embedding it in a pattern: characters such as '.', '+' or '('
        # must match literally instead of acting as regex syntax.
        video_count = int(self._search_regex(
            r'<strong>%s\'s Videos \(([0-9]+)\)</strong>' % re.escape(username),
            profile_page, 'video count'))

        PAGE_SIZE = 25
        # Ceiling division: exactly as many pages as are needed to hold
        # video_count items (0 videos -> 0 pages, 25 -> 1, 26 -> 2).
        page_count = (video_count + PAGE_SIZE - 1) // PAGE_SIZE

        urls = []
        for n in range(1, page_count + 1):
            lpage_url = 'http://www.xtube.com/user_videos.php?page=%d&u=%s' % (n, username)
            lpage = self._download_webpage(
                lpage_url, username,
                note='Downloading page %d/%d' % (n, page_count))
            urls.extend(
                re.findall(r'addthis:url="([^"]+)"', lpage))

        return {
            '_type': 'playlist',
            'id': username,
            'age_limit': 18,
            'entries': [{
                '_type': 'url',
                'url': eurl,
                'ie_key': 'XTube',
            } for eurl in urls]
        }
Commit	Line	Data
c5ba203e AS	1	from __future__ import unicode_literals
c5ba203e AS	2
dcc2a706	3	import re
	4
	5	from .common import InfoExtractor
5c2266df	6	from ..compat import compat_urllib_parse_unquote
1cc79574	7	from ..utils import (
607dbbad	8	parse_duration,
5c2266df	9	sanitized_Request,
607dbbad	10	str_to_int,
dcc2a706	11	)
dcc2a706	12
607dbbad	13
dcc2a706	14	class XTubeIE(InfoExtractor):
16ea8179	15	_VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<id>[^/?&#]+))'
dcc2a706	16	_TEST = {
c5ba203e	17	'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
c5ba203e AS	18	'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
c5ba203e AS	19	'info_dict': {
607dbbad S	20	'id': 'kVTUy_G222_',
	21	'ext': 'mp4',
	22	'title': 'strange erotica',
9789d753	23	'description': 'contains:an ET kind of thing',
607dbbad S	24	'uploader': 'greenshowers',
	25	'duration': 450,
	26	'age_limit': 18,
dcc2a706	27	}
	28	}
	29
	30	def _real_extract(self, url):
16ea8179	31	video_id = self._match_id(url)
dcc2a706	32
5c2266df	33	req = sanitized_Request(url)
dcc2a706	34	req.add_header('Cookie', 'age_verified=1')
	35	webpage = self._download_webpage(req, video_id)
	36
16ea8179 S	37	video_title = self._html_search_regex(
16ea8179 S	38	r'<p class="title">([^<]+)', webpage, 'title')
607dbbad	39	video_uploader = self._html_search_regex(
16ea8179	40	[r"var\s+contentOwnerId\s*=\s*'([^']+)",
324b2c78	41	r'By:\s*<a href="/community/profile\.php\?user=([^"]+)'],
16ea8179	42	webpage, 'uploader', fatal=False)
607dbbad	43	video_description = self._html_search_regex(
16ea8179 S	44	r'<p class="fieldsDesc">([^<]+)',
16ea8179 S	45	webpage, 'description', fatal=False)
607dbbad	46	duration = parse_duration(self._html_search_regex(
16ea8179 S	47	r'<span class="bold">Runtime:</span> ([^<]+)</p>',
	48	webpage, 'duration', fatal=False))
	49	view_count = str_to_int(self._html_search_regex(
	50	r'<span class="bold">Views:</span> ([\d,\.]+)</p>',
	51	webpage, 'view count', fatal=False))
	52	comment_count = str_to_int(self._html_search_regex(
	53	r'<div id="commentBar">([\d,\.]+) Comments</div>',
	54	webpage, 'comment count', fatal=False))
aa488e13	55
16ea8179 S	56	formats = []
	57	for format_id, video_url in re.findall(
	58	r'flashvars\.quality_(.+?)\s*=\s*"([^"]+)"', webpage):
	59	fmt = {
ee8de13e	60	'url': compat_urllib_parse_unquote(video_url),
aa488e13	61	'format_id': format_id,
16ea8179 S	62	}
	63	m = re.search(r'^(?P<height>\d+)[pP]', format_id)
	64	if m:
	65	fmt['height'] = int(m.group('height'))
	66	formats.append(fmt)
	67
	68	if not formats:
ee8de13e	69	video_url = compat_urllib_parse_unquote(self._search_regex(
16ea8179 S	70	r'flashvars\.video_url\s*=\s*"([^"]+)"',
	71	webpage, 'video URL'))
	72	formats.append({'url': video_url})
	73
aa488e13	74	self._sort_formats(formats)
dcc2a706	75
	76	return {
	77	'id': video_id,
	78	'title': video_title,
	79	'uploader': video_uploader,
	80	'description': video_description,
607dbbad S	81	'duration': duration,
	82	'view_count': view_count,
	83	'comment_count': comment_count,
aa488e13	84	'formats': formats,
dcc2a706	85	'age_limit': 18,
9f5809b3	86	}
9f5809b3	87
22a6f150	88
9f5809b3	89	class XTubeUserIE(InfoExtractor):
	90	IE_DESC = 'XTube user profile'
	91	_VALID_URL = r'https?://(?:www\.)?xtube\.com/community/profile\.php\?(.*?)user=(?P<username>[^&#]+)(?:$|[&#])'
22a6f150 PH	92	_TEST = {
	93	'url': 'http://www.xtube.com/community/profile.php?user=greenshowers',
	94	'info_dict': {
	95	'id': 'greenshowers',
05900629	96	'age_limit': 18,
22a6f150 PH	97	},
	98	'playlist_mincount': 155,
	99	}
9f5809b3	100
	101	def _real_extract(self, url):
	102	mobj = re.match(self._VALID_URL, url)
	103	username = mobj.group('username')
	104
	105	profile_page = self._download_webpage(
	106	url, username, note='Retrieving profile page')
	107
	108	video_count = int(self._search_regex(
2514d263	109	r'<strong>%s\'s Videos \(([0-9]+)\)</strong>' % username, profile_page,
9f5809b3	110	'video count'))
	111
	112	PAGE_SIZE = 25
	113	urls = []
	114	page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE
	115	for n in range(1, page_count + 1):
	116	lpage_url = 'http://www.xtube.com/user_videos.php?page=%d&u=%s' % (n, username)
	117	lpage = self._download_webpage(
	118	lpage_url, username,
	119	note='Downloading page %d/%d' % (n, page_count))
	120	urls.extend(
	121	re.findall(r'addthis:url="([^"]+)"', lpage))
	122
	123	return {
	124	'_type': 'playlist',
	125	'id': username,
05900629	126	'age_limit': 18,
9f5809b3	127	'entries': [{
	128	'_type': 'url',
	129	'url': eurl,
	130	'ie_key': 'XTube',
	131	} for eurl in urls]
	132	}