[yt-dlp.git] / youtube_dl / extractor / redtube.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    ExtractorError,
    int_or_none,
    str_to_int,
    unified_strdate,
)


class RedTubeIE(InfoExtractor):
    """Extractor for RedTube watch pages and ``embed.redtube.com`` players.

    Both supported URL shapes carry a numeric video id. Extraction always
    downloads the ``www.redtube.com/<id>`` watch page for that id and reads
    the formats and metadata from its HTML.
    """

    # Matches "redtube.com/<id>" (optionally with "www.") as well as
    # "embed.redtube.com/?...id=<id>"; the numeric id is captured as "id".
    _VALID_URL = r'https?://(?:(?:www\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.redtube.com/66418',
        'md5': 'fc08071233725f26b8f014dba9590005',
        'info_dict': {
            'id': '66418',
            'ext': 'mp4',
            'title': 'Sucked on a toilet',
            'upload_date': '20110811',
            'duration': 596,
            'view_count': int,
            'age_limit': 18,
        }
    }, {
        'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the ``src`` URLs of all RedTube embed iframes in *webpage*.

        The pattern has a single capturing group, so ``re.findall`` yields a
        list of URL strings (possibly empty). Protocol-relative URLs
        (``//embed.redtube.com/...``) are accepted and returned as found.
        """
        return re.findall(
            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)',
            webpage)

    def _real_extract(self, url):
        """Build the info dict for the video referenced by *url*.

        Formats are gathered from the page's ``sources`` object and its
        ``mediaDefinition`` array; when neither yields anything, the first
        ``<source ... type="video/mp4">`` tag is used instead.

        Raises ExtractorError (expected) when the page carries one of the
        "video removed" markers.
        """
        video_id = self._match_id(url)
        # Embed URLs are resolved through the regular watch page as well.
        webpage = self._download_webpage(
            'http://www.redtube.com/%s' % video_id, video_id)

        if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
            raise ExtractorError('Video %s has been removed' % video_id, expected=True)

        # In the heading pattern the look-ahead is anchored on the closing
        # tag (</hN>) rather than on the bare heading level digit, so that
        # titles which happen to contain that digit still match.
        # In the second pattern \1 is the opening quote character, so
        # excluding it from the title is intended.
        title = self._html_search_regex(
            (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle)[^"]*">(?P<title>(?:(?!</h\1>).)+)</h\1>',
             r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',),
            webpage, 'title', group='title',
            default=None) or self._og_search_title(webpage)

        formats = []

        # "sources" is read as a mapping of quality label -> URL.
        sources = self._parse_json(
            self._search_regex(
                r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
            video_id, fatal=False)
        if sources and isinstance(sources, dict):
            for format_id, format_url in sources.items():
                # Same validation as for mediaDefinition entries below:
                # only non-empty string URLs are usable.
                if not format_url or not isinstance(format_url, compat_str):
                    continue
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                    'height': int_or_none(format_id),
                })

        # "mediaDefinition" is read as a list of objects with "videoUrl" and
        # "quality" keys. The '{}' default parses to an empty dict, which
        # fails the list check below and therefore contributes no formats.
        medias = self._parse_json(
            self._search_regex(
                r'mediaDefinition\s*:\s*(\[.+?\])', webpage,
                'media definitions', default='{}'),
            video_id, fatal=False)
        if medias and isinstance(medias, list):
            for media in medias:
                # Skip malformed entries instead of failing on .get().
                if not isinstance(media, dict):
                    continue
                format_url = media.get('videoUrl')
                if not format_url or not isinstance(format_url, compat_str):
                    continue
                format_id = media.get('quality')
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                    'height': int_or_none(format_id),
                })

        if not formats:
            # Last resort: plain HTML5 <source> tag. No default is passed
            # here, so a miss is handled by _html_search_regex itself
            # (presumably fatal - confirm against InfoExtractor).
            video_url = self._html_search_regex(
                r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
            formats.append({'url': video_url})
        self._sort_formats(formats)

        thumbnail = self._og_search_thumbnail(webpage)
        upload_date = unified_strdate(self._search_regex(
            r'<span[^>]+>ADDED ([^<]+)<',
            webpage, 'upload date', fatal=False))
        # Prefer the OpenGraph duration, fall back to the inline
        # "videoDuration" value.
        duration = int_or_none(self._og_search_property(
            'video:duration', webpage, default=None) or self._search_regex(
                r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None))
        # Two alternative page layouts for the view counter.
        view_count = str_to_int(self._search_regex(
            (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)',
             r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)'),
            webpage, 'view count', fatal=False))

        # No self-labeling, but they describe themselves as
        # "Home of Videos Porno"
        age_limit = 18

        return {
            'id': video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'duration': duration,
            'view_count': view_count,
            'age_limit': age_limit,
            'formats': formats,
        }
Commit	Line	Data
032b3df5 PH	1	from __future__ import unicode_literals
032b3df5 PH	2
e28ed498 S	3	import re
e28ed498 S	4
9f5daf00	5	from .common import InfoExtractor
880fa66f	6	from ..compat import compat_str
ac12e888 S	7	from ..utils import (
	8	ExtractorError,
	9	int_or_none,
	10	str_to_int,
	11	unified_strdate,
	12	)
9f5daf00 PH	13
	14
	15	class RedTubeIE(InfoExtractor):
5021ca6c S	16	_VALID_URL = r'https?://(?:(?:www\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
5021ca6c S	17	_TESTS = [{
032b3df5	18	'url': 'http://www.redtube.com/66418',
18ebd1a8	19	'md5': 'fc08071233725f26b8f014dba9590005',
032b3df5	20	'info_dict': {
faf34948 PH	21	'id': '66418',
faf34948 PH	22	'ext': 'mp4',
838b9340	23	'title': 'Sucked on a toilet',
18ebd1a8	24	'upload_date': '20110811',
ac12e888 S	25	'duration': 596,
ac12e888 S	26	'view_count': int,
838b9340	27	'age_limit': 18,
6f5ac90c	28	}
5021ca6c S	29	}, {
	30	'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286',
	31	'only_matching': True,
	32	}]
9f5daf00	33
e28ed498 S	34	@staticmethod
	35	def _extract_urls(webpage):
	36	return re.findall(
	37	r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)',
	38	webpage)
	39
cd214418	40	def _real_extract(self, url):
faf34948	41	video_id = self._match_id(url)
5021ca6c S	42	webpage = self._download_webpage(
5021ca6c S	43	'http://www.redtube.com/%s' % video_id, video_id)
9f5daf00	44
2676caf3 S	45	if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
	46	raise ExtractorError('Video %s has been removed' % video_id, expected=True)
	47
ac12e888	48	title = self._html_search_regex(
1367c798 S	49	(r'<h(\d)[^>]+class="(?:video_title_text|videoTitle)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>',
	50	r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',),
	51	webpage, 'title', group='title',
	52	default=None) or self._og_search_title(webpage)
ac12e888 S	53
	54	formats = []
	55	sources = self._parse_json(
	56	self._search_regex(
	57	r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
	58	video_id, fatal=False)
	59	if sources and isinstance(sources, dict):
	60	for format_id, format_url in sources.items():
	61	if format_url:
	62	formats.append({
	63	'url': format_url,
	64	'format_id': format_id,
	65	'height': int_or_none(format_id),
	66	})
880fa66f S	67	medias = self._parse_json(
	68	self._search_regex(
	69	r'mediaDefinition\s*:\s*(\[.+?\])', webpage,
	70	'media definitions', default='{}'),
	71	video_id, fatal=False)
	72	if medias and isinstance(medias, list):
	73	for media in medias:
	74	format_url = media.get('videoUrl')
	75	if not format_url or not isinstance(format_url, compat_str):
	76	continue
	77	format_id = media.get('quality')
	78	formats.append({
	79	'url': format_url,
	80	'format_id': format_id,
	81	'height': int_or_none(format_id),
	82	})
	83	if not formats:
ac12e888 S	84	video_url = self._html_search_regex(
	85	r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
	86	formats.append({'url': video_url})
	87	self._sort_formats(formats)
	88
	89	thumbnail = self._og_search_thumbnail(webpage)
	90	upload_date = unified_strdate(self._search_regex(
1367c798	91	r'<span[^>]+>ADDED ([^<]+)<',
ac12e888	92	webpage, 'upload date', fatal=False))
18ebd1a8 W	93	duration = int_or_none(self._og_search_property(
	94	'video:duration', webpage, default=None) or self._search_regex(
	95	r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None))
ac12e888	96	view_count = str_to_int(self._search_regex(
1367c798 S	97	(r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)',
1367c798 S	98	r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)'),
ac12e888 S	99	webpage, 'view count', fatal=False))
ac12e888 S	100
1310bf24 PH	101	# No self-labeling, but they describe themselves as
	102	# "Home of Videos Porno"
	103	age_limit = 18
	104
cd214418	105	return {
032b3df5	106	'id': video_id,
faf34948	107	'ext': 'mp4',
ac12e888 S	108	'title': title,
	109	'thumbnail': thumbnail,
	110	'upload_date': upload_date,
	111	'duration': duration,
	112	'view_count': view_count,
1310bf24	113	'age_limit': age_limit,
ac12e888	114	'formats': formats,
cd214418	115	}