[yt-dlp.git] / youtube_dl / extractor / fourtube.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    compat_urllib_request,
    unified_strdate,
    str_to_int,
    parse_duration,
    clean_html,
)


class FourTubeIE(InfoExtractor):
    """Extractor for video pages on 4tube.com.

    The video page embeds a ``playerConfigPlaylist`` JavaScript object that
    carries the media id, the list of available sources, the title and the
    thumbnail.  The actual media URLs are obtained by POSTing to the token
    service on ``tkn.4tube.com``.
    """
    IE_NAME = '4tube'
    _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)'

    _TEST = {
        'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
        'md5': '6516c8ac63b03de06bc8eac14362db4f',
        'info_dict': {
            'id': '209733',
            'ext': 'mp4',
            'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
            'uploader': 'WCP Club',
            'uploader_id': 'wcp-club',
            'upload_date': '20131031',
            'duration': 583,
        }
    }

    def _real_extract(self, url):
        """Extract the info dict for the video page at *url*.

        Downloads the canonical video page, parses the player configuration
        and the page metadata, requests one token per listed source and
        returns a dict with the keys ``id``, ``title``, ``formats``,
        ``thumbnail``, ``uploader``, ``uploader_id``, ``upload_date``,
        ``view_count``, ``duration``, ``age_limit`` and ``webpage_url``.

        The playlist, media id, sources and title lookups are fatal when they
        do not match; thumbnail, uploader and the description-derived fields
        are optional and end up as ``None`` when they cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Rebuild the URL from the numeric id only; the slug part of the
        # incoming URL is not needed.
        webpage_url = 'http://www.4tube.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        playlist_json = self._html_search_regex(
            r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, 'Playlist')
        media_id = self._search_regex(
            r'idMedia:\s*(\d+)', playlist_json, 'Media Id')
        sources_str = self._search_regex(
            r'sources:\s*\[([^\]]*)\]', playlist_json, 'Sources')
        # Trim each entry and drop empty ones so that stray whitespace or an
        # empty list cannot corrupt the token URL, the token lookup or the
        # int() conversion below.
        sources = [
            source.strip()
            for source in sources_str.split(',')
            if source.strip()
        ]
        title = self._search_regex(
            r'title:\s*"([^"]*)', playlist_json, 'Title')
        thumbnail_url = self._search_regex(
            r'image:\s*"([^"]*)', playlist_json, 'Thumbnail', fatal=False)

        # The uploader lookup is non-fatal and may yield None; guard before
        # handing the value to re.search, which would raise TypeError.
        uploader = None
        uploader_id = None
        uploader_str = self._search_regex(
            r'<span>Uploaded by</span>(.*?)<span>', webpage, 'uploader',
            fatal=False)
        if uploader_str is not None:
            uploader_mobj = re.search(
                r'<a href="/sites/(?P<id>[^"]+)"><strong>(?P<name>[^<]+)</strong></a>',
                uploader_str)
            if uploader_mobj:
                uploader = uploader_mobj.group('name')
                uploader_id = uploader_mobj.group('id')
            else:
                # No site link present: fall back to the tag-stripped text.
                uploader = clean_html(uploader_str)

        upload_date = None
        view_count = None
        duration = None
        # Publish date, view count and length are parsed out of the
        # description meta tag.
        description = self._html_search_meta(
            'description', webpage, 'description')
        if description:
            upload_date = self._search_regex(
                r'Published Date: (\d{2} [a-zA-Z]{3} \d{4})', description,
                'upload date', fatal=False)
            if upload_date:
                upload_date = unified_strdate(upload_date)
            view_count = self._search_regex(
                r'Views: ([\d,\.]+)', description, 'view count', fatal=False)
            if view_count:
                view_count = str_to_int(view_count)
            duration = parse_duration(self._search_regex(
                r'Length: (\d+m\d+s)', description, 'duration', fatal=False))

        # A single POST to the token service covers all sources, joined
        # with '+'.
        token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(
            media_id, "+".join(sources))
        headers = {
            b'Content-Type': b'application/x-www-form-urlencoded',
            b'Origin': b'http://www.4tube.com',
        }
        token_req = compat_urllib_request.Request(token_url, b'{}', headers)
        tokens = self._download_json(token_req, video_id)

        # The token response is indexed by the source string; the numeric
        # value of the source is used as the format quality.
        formats = [{
            'url': tokens[source]['token'],
            'format_id': source + 'p',
            'resolution': source + 'p',
            'quality': int(source),
        } for source in sources]

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail_url,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'upload_date': upload_date,
            'view_count': view_count,
            'duration': duration,
            'age_limit': 18,
            'webpage_url': webpage_url,
        }
Commit	Line	Data
ae6cae78 S	1	from __future__ import unicode_literals
ae6cae78 S	2
03635e2a MK	3	import re
	4
	5	from .common import InfoExtractor
ae6cae78 S	6	from ..utils import (
	7	compat_urllib_request,
	8	unified_strdate,
	9	str_to_int,
	10	parse_duration,
e77c5b4f	11	clean_html,
ae6cae78	12	)
ae6cae78	13
03635e2a MK	14
	15	class FourTubeIE(InfoExtractor):
	16	IE_NAME = '4tube'
ae6cae78	17	_VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)'
03635e2a MK	18
03635e2a MK	19	_TEST = {
ae6cae78 S	20	'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
	21	'md5': '6516c8ac63b03de06bc8eac14362db4f',
	22	'info_dict': {
	23	'id': '209733',
	24	'ext': 'mp4',
	25	'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
	26	'uploader': 'WCP Club',
	27	'uploader_id': 'wcp-club',
	28	'upload_date': '20131031',
	29	'duration': 583,
	30	}
	31	}
03635e2a MK	32
	33	def _real_extract(self, url):
	34	mobj = re.match(self._VALID_URL, url)
	35
	36	video_id = mobj.group('id')
	37	webpage_url = 'http://www.4tube.com/videos/' + video_id
	38	webpage = self._download_webpage(webpage_url, video_id)
	39
	40	self.report_extraction(video_id)
	41
ae6cae78 S	42	playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, 'Playlist')
	43	media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, 'Media Id')
	44	sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, 'Sources').split(',')
	45	title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, 'Title')
	46	thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, 'Thumbnail', fatal=False)
	47
	48	uploader_str = self._search_regex(r'<span>Uploaded by</span>(.*?)<span>', webpage, 'uploader', fatal=False)
	49	mobj = re.search(r'<a href="/sites/(?P<id>[^"]+)"><strong>(?P<name>[^<]+)</strong></a>', uploader_str)
	50	(uploader, uploader_id) = (mobj.group('name'), mobj.group('id')) if mobj else (clean_html(uploader_str), None)
	51
	52	upload_date = None
	53	view_count = None
	54	duration = None
	55	description = self._html_search_meta('description', webpage, 'description')
	56	if description:
	57	upload_date = self._search_regex(r'Published Date: (\d{2} [a-zA-Z]{3} \d{4})', description, 'upload date',
	58	fatal=False)
	59	if upload_date:
	60	upload_date = unified_strdate(upload_date)
	61	view_count = self._search_regex(r'Views: ([\d,\.]+)', description, 'view count', fatal=False)
	62	if view_count:
	63	view_count = str_to_int(view_count)
	64	duration = parse_duration(self._search_regex(r'Length: (\d+m\d+s)', description, 'duration', fatal=False))
03635e2a MK	65
	66	token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(media_id, "+".join(sources))
	67	headers = {
	68	b'Content-Type': b'application/x-www-form-urlencoded',
	69	b'Origin': b'http://www.4tube.com',
	70	}
	71	token_req = compat_urllib_request.Request(token_url, b'{}', headers)
	72	tokens = self._download_json(token_req, video_id)
	73
	74	formats = [{
	75	'url': tokens[format]['token'],
	76	'format_id': format + 'p',
	77	'resolution': format + 'p',
	78	'quality': int(format),
	79	} for format in sources]
	80
ae6cae78 S	81	self._sort_formats(formats)
	82
	83	return {
03635e2a MK	84	'id': video_id,
	85	'title': title,
	86	'formats': formats,
	87	'thumbnail': thumbnail_url,
ae6cae78 S	88	'uploader': uploader,
	89	'uploader_id': uploader_id,
	90	'upload_date': upload_date,
	91	'view_count': view_count,
	92	'duration': duration,
03635e2a MK	93	'age_limit': 18,
03635e2a MK	94	'webpage_url': webpage_url,
ae6cae78	95	}