[yt-dlp.git] / youtube_dl / extractor / redtube.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    ExtractorError,
    int_or_none,
    str_to_int,
    unified_strdate,
)


class RedTubeIE(InfoExtractor):
    """Extractor for RedTube video pages and their embed URLs.

    Both ``redtube.com/<id>`` and ``embed.redtube.com/?...id=<id>`` URLs are
    accepted; the numeric id is captured as the video id.
    """

    _VALID_URL = r'https?://(?:(?:www\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.redtube.com/66418',
        'md5': '7b8c22b5e7098a3e1c09709df1126d2d',
        'info_dict': {
            'id': '66418',
            'ext': 'mp4',
            'title': 'Sucked on a toilet',
            'upload_date': '20120831',
            'duration': 596,
            'view_count': int,
            'age_limit': 18,
        }
    }, {
        'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the ``src`` URLs of all RedTube embed iframes in *webpage*.

        The pattern has a single capturing group, so ``re.findall`` yields a
        flat list of URL strings (possibly protocol-relative, ``//embed...``).
        """
        return re.findall(
            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)',
            webpage)

    def _real_extract(self, url):
        """Build the info dict for the video referenced by *url*.

        The canonical ``www.redtube.com/<id>`` page is always downloaded,
        regardless of whether *url* was a watch or an embed URL.

        Formats are collected from the ``sources`` object and the
        ``mediaDefinition`` array embedded in the page; if neither yields
        anything, the first ``<source ... type="video/mp4">`` tag is used.

        Raises ExtractorError (expected) when the page carries one of the
        known "video removed" markers.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://www.redtube.com/%s' % video_id, video_id)

        if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
            raise ExtractorError('Video %s has been removed' % video_id, expected=True)

        # The second pattern is a fallback on the JS player config. Its title
        # group must consume everything up to the matching closing quote; an
        # empty group would only ever match `videoTitle: ""`.
        title = self._html_search_regex(
            (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>',
             r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
            webpage, 'title', group='title')

        formats = []

        # `sources` is expected to be an object mapping a quality label
        # (used as format id and, when numeric, as height) to a media URL.
        sources = self._parse_json(
            self._search_regex(
                r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
            video_id, fatal=False)
        if sources and isinstance(sources, dict):
            for format_id, format_url in sources.items():
                # Same validation as for mediaDefinition below: skip empty
                # or non-string values instead of emitting a broken format.
                if not format_url or not isinstance(format_url, compat_str):
                    continue
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                    'height': int_or_none(format_id),
                })

        # `mediaDefinition` is expected to be an array of objects carrying
        # `videoUrl` and `quality`. The '{}' default parses to a dict, which
        # the isinstance check below rejects, so a missing array is a no-op.
        medias = self._parse_json(
            self._search_regex(
                r'mediaDefinition\s*:\s*(\[.+?\])', webpage,
                'media definitions', default='{}'),
            video_id, fatal=False)
        if medias and isinstance(medias, list):
            for media in medias:
                # Guard against malformed entries (null, strings, ...) that
                # have no .get().
                if not isinstance(media, dict):
                    continue
                format_url = media.get('videoUrl')
                if not format_url or not isinstance(format_url, compat_str):
                    continue
                format_id = media.get('quality')
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                    'height': int_or_none(format_id),
                })

        # Last resort: a plain HTML5 <source> tag.
        if not formats:
            video_url = self._html_search_regex(
                r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
            formats.append({'url': video_url})
        self._sort_formats(formats)

        # Optional metadata: none of these lookups is fatal.
        thumbnail = self._og_search_thumbnail(webpage)
        upload_date = unified_strdate(self._search_regex(
            r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',
            webpage, 'upload date', fatal=False))
        duration = int_or_none(self._search_regex(
            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None))
        view_count = str_to_int(self._search_regex(
            r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',
            webpage, 'view count', fatal=False))

        # No self-labeling, but they describe themselves as
        # "Home of Videos Porno"
        age_limit = 18

        return {
            'id': video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'duration': duration,
            'view_count': view_count,
            'age_limit': age_limit,
            'formats': formats,
        }
Commit	Line	Data
032b3df5 PH	1	from __future__ import unicode_literals
032b3df5 PH	2
e28ed498 S	3	import re
e28ed498 S	4
9f5daf00	5	from .common import InfoExtractor
880fa66f	6	from ..compat import compat_str
ac12e888 S	7	from ..utils import (
	8	ExtractorError,
	9	int_or_none,
	10	str_to_int,
	11	unified_strdate,
	12	)
9f5daf00 PH	13
	14
	15	class RedTubeIE(InfoExtractor):
5021ca6c S	16	_VALID_URL = r'https?://(?:(?:www\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
5021ca6c S	17	_TESTS = [{
032b3df5	18	'url': 'http://www.redtube.com/66418',
838b9340	19	'md5': '7b8c22b5e7098a3e1c09709df1126d2d',
032b3df5	20	'info_dict': {
faf34948 PH	21	'id': '66418',
faf34948 PH	22	'ext': 'mp4',
838b9340	23	'title': 'Sucked on a toilet',
ac12e888 S	24	'upload_date': '20120831',
	25	'duration': 596,
	26	'view_count': int,
838b9340	27	'age_limit': 18,
6f5ac90c	28	}
5021ca6c S	29	}, {
	30	'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286',
	31	'only_matching': True,
	32	}]
9f5daf00	33
e28ed498 S	34	@staticmethod
	35	def _extract_urls(webpage):
	36	return re.findall(
	37	r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)',
	38	webpage)
	39
cd214418	40	def _real_extract(self, url):
faf34948	41	video_id = self._match_id(url)
5021ca6c S	42	webpage = self._download_webpage(
5021ca6c S	43	'http://www.redtube.com/%s' % video_id, video_id)
9f5daf00	44
2676caf3 S	45	if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
	46	raise ExtractorError('Video %s has been removed' % video_id, expected=True)
	47
ac12e888 S	48	title = self._html_search_regex(
	49	(r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>',
	50	r'videoTitle\s*:\s*(["\'])(?P<title>)\1'),
	51	webpage, 'title', group='title')
	52
	53	formats = []
	54	sources = self._parse_json(
	55	self._search_regex(
	56	r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
	57	video_id, fatal=False)
	58	if sources and isinstance(sources, dict):
	59	for format_id, format_url in sources.items():
	60	if format_url:
	61	formats.append({
	62	'url': format_url,
	63	'format_id': format_id,
	64	'height': int_or_none(format_id),
	65	})
880fa66f S	66	medias = self._parse_json(
	67	self._search_regex(
	68	r'mediaDefinition\s*:\s*(\[.+?\])', webpage,
	69	'media definitions', default='{}'),
	70	video_id, fatal=False)
	71	if medias and isinstance(medias, list):
	72	for media in medias:
	73	format_url = media.get('videoUrl')
	74	if not format_url or not isinstance(format_url, compat_str):
	75	continue
	76	format_id = media.get('quality')
	77	formats.append({
	78	'url': format_url,
	79	'format_id': format_id,
	80	'height': int_or_none(format_id),
	81	})
	82	if not formats:
ac12e888 S	83	video_url = self._html_search_regex(
	84	r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
	85	formats.append({'url': video_url})
	86	self._sort_formats(formats)
	87
	88	thumbnail = self._og_search_thumbnail(webpage)
	89	upload_date = unified_strdate(self._search_regex(
	90	r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',
	91	webpage, 'upload date', fatal=False))
	92	duration = int_or_none(self._search_regex(
880fa66f	93	r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None))
ac12e888 S	94	view_count = str_to_int(self._search_regex(
	95	r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',
	96	webpage, 'view count', fatal=False))
	97
1310bf24 PH	98	# No self-labeling, but they describe themselves as
	99	# "Home of Videos Porno"
	100	age_limit = 18
	101
cd214418	102	return {
032b3df5	103	'id': video_id,
faf34948	104	'ext': 'mp4',
ac12e888 S	105	'title': title,
	106	'thumbnail': thumbnail,
	107	'upload_date': upload_date,
	108	'duration': duration,
	109	'view_count': view_count,
1310bf24	110	'age_limit': age_limit,
ac12e888	111	'formats': formats,
cd214418	112	}