[yt-dlp.git] / yt_dlp / extractor / spankwire.py

import re

from .common import InfoExtractor
from ..utils import (
    float_or_none,
    int_or_none,
    merge_dicts,
    str_or_none,
    str_to_int,
    url_or_none,
)


class SpankwireIE(InfoExtractor):
    """Extractor for spankwire.com video pages and EmbedPlayer iframes.

    Core metadata and the format URLs are read from the site's JSON API.
    The HTML watch page is additionally fetched (non-fatally) to pick up
    JSON-LD data, an extra thumbnail, the uploader name and a fallback
    view count.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?spankwire\.com/
                        (?:
                            [^/]+/video|
                            EmbedPlayer\.aspx/?\?.*?\bArticleId=
                        )
                        (?P<id>\d+)
                    '''
    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)']
    _TESTS = [{
        # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4
        'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
        'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd',
        'info_dict': {
            'id': '103545',
            'ext': 'mp4',
            'title': 'Buckcherry`s X Rated Music Video Crazy Bitch',
            'description': 'Crazy Bitch X rated music video.',
            'duration': 222,
            'uploader': 'oreusz',
            'uploader_id': '124697',
            'timestamp': 1178587885,
            'upload_date': '20070508',
            'average_rating': float,
            'view_count': int,
            'comment_count': int,
            'age_limit': 18,
            'categories': list,
            'tags': list,
        },
    }, {
        # download URL pattern: */mp4_<format_id>_<video_id>.mp4
        'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/',
        'md5': '09b3c20833308b736ae8902db2f8d7e6',
        'info_dict': {
            'id': '1921551',
            'ext': 'mp4',
            'title': 'Titcums Compiloation I',
            'description': 'cum on tits',
            'uploader': 'dannyh78999',
            'uploader_id': '3056053',
            'upload_date': '20150822',
            'age_limit': 18,
        },
        'params': {
            'proxy': '127.0.0.1:8118'
        },
        'skip': 'removed',
    }, {
        'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the video identified by *url*.

        The API JSON must contain ``title`` (a missing key raises
        ``KeyError``); every other field is optional. Values taken from
        the API and the HTML page win over JSON-LD values, because the
        JSON-LD dict is passed last to ``merge_dicts``.
        """
        video_id = self._match_id(url)

        video = self._download_json(
            'https://www.spankwire.com/api/video/%s.json' % video_id, video_id)

        title = video['title']

        formats = []
        videos = video.get('videos')
        if isinstance(videos, dict):
            for format_id, format_url in videos.items():
                video_url = url_or_none(format_url)
                # Test the sanitized value, not the raw one: a truthy but
                # invalid entry would otherwise reach re.search() as None
                # and raise TypeError.
                if not video_url:
                    continue
                # Height comes from the API key when it looks like "<n>p";
                # otherwise fall back to the "<height>P_<tbr>K" URL pattern.
                height = int_or_none(self._search_regex(
                    r'(\d+)[pP]', format_id, 'height', default=None))
                m = re.search(
                    r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url)
                if m:
                    tbr = int(m.group('tbr'))
                    height = height or int(m.group('height'))
                else:
                    tbr = None
                formats.append({
                    'url': video_url,
                    'format_id': '%dp' % height if height else format_id,
                    'height': height,
                    'tbr': tbr,
                })
        m3u8_url = url_or_none(video.get('HLS'))
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id='hls', fatal=False))

        view_count = str_to_int(video.get('viewed'))

        # 'poster' gets preference 0 and 'poster2x' preference 1.
        thumbnails = []
        for preference, suffix in enumerate(('', '2x')):
            thumbnail_url = url_or_none(video.get('poster%s' % suffix))
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'preference': preference,
            })

        def extract_names(key):
            """Return the non-empty 'name' values of the list at video[key].

            Returns None when video[key] is not a list. Entries that are
            not dicts are skipped instead of aborting the extraction.
            """
            entries_list = video.get(key)
            if not isinstance(entries_list, list):
                return
            entries = []
            for entry in entries_list:
                if not isinstance(entry, dict):
                    continue
                name = str_or_none(entry.get('name'))
                if name:
                    entries.append(name)
            return entries

        categories = extract_names('categories')
        tags = extract_names('tags')

        uploader = None
        info = {}

        # The watch page is optional: a failed download only loses the
        # supplementary fields gathered below.
        webpage = self._download_webpage(
            'https://www.spankwire.com/_/video%s/' % video_id, video_id,
            fatal=False)
        if webpage:
            info = self._search_json_ld(webpage, video_id, default={})
            # Remove the JSON-LD thumbnail from info so that it is reported
            # through 'thumbnails' only.
            thumbnail_url = url_or_none(info.pop('thumbnail', None))
            if not thumbnail_url:
                thumbnail_url = self._og_search_thumbnail(webpage)
            if thumbnail_url:
                thumbnails.append({
                    'url': thumbnail_url,
                    'preference': 10,
                })
            uploader = self._html_search_regex(
                r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>',
                webpage, 'uploader', fatal=False)
            if not view_count:
                view_count = str_to_int(self._search_regex(
                    r'data-views=["\']([\d,.]+)', webpage, 'view count',
                    fatal=False))

        return merge_dicts({
            'id': video_id,
            'title': title,
            'description': video.get('description'),
            'duration': int_or_none(video.get('duration')),
            'thumbnails': thumbnails,
            'uploader': uploader,
            'uploader_id': str_or_none(video.get('userId')),
            'timestamp': int_or_none(video.get('time_approved_on')),
            'average_rating': float_or_none(video.get('rating')),
            'view_count': view_count,
            'comment_count': int_or_none(video.get('comments')),
            'age_limit': 18,
            'categories': categories,
            'tags': tags,
            'formats': formats,
        }, info)
Commit	Line	Data
7b2212e9	1	import re
	2
	3	from .common import InfoExtractor
1cc79574	4	from ..utils import (
d44a707f S	5	float_or_none,
	6	int_or_none,
	7	merge_dicts,
	8	str_or_none,
9767726b	9	str_to_int,
d44a707f	10	url_or_none,
7b2212e9	11	)
7b2212e9	12
9ac0a675	13
7b2212e9	14	class SpankwireIE(InfoExtractor):
d44a707f S	15	_VALID_URL = r'''(?x)
	16	https?://
	17	(?:www\.)?spankwire\.com/
	18	(?:
	19	[^/]+/video|
	20	EmbedPlayer\.aspx/?\?.*?\bArticleId=
	21	)
	22	(?P<id>\d+)
	23	'''
bfd973ec	24	_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)']
59e6acc7	25	_TESTS = [{
551c7837 S	26	# download URL pattern: */<height>P_<tbr>K_<video_id>.mp4
551c7837 S	27	'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
d44a707f	28	'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd',
551c7837 S	29	'info_dict': {
	30	'id': '103545',
	31	'ext': 'mp4',
	32	'title': 'Buckcherry`s X Rated Music Video Crazy Bitch',
	33	'description': 'Crazy Bitch X rated music video.',
d44a707f	34	'duration': 222,
551c7837 S	35	'uploader': 'oreusz',
551c7837 S	36	'uploader_id': '124697',
d44a707f S	37	'timestamp': 1178587885,
	38	'upload_date': '20070508',
	39	'average_rating': float,
	40	'view_count': int,
	41	'comment_count': int,
551c7837	42	'age_limit': 18,
d44a707f S	43	'categories': list,
	44	'tags': list,
	45	},
551c7837 S	46	}, {
	47	# download URL pattern: */mp4_<format_id>_<video_id>.mp4
	48	'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/',
	49	'md5': '09b3c20833308b736ae8902db2f8d7e6',
	50	'info_dict': {
	51	'id': '1921551',
	52	'ext': 'mp4',
	53	'title': 'Titcums Compiloation I',
	54	'description': 'cum on tits',
	55	'uploader': 'dannyh78999',
	56	'uploader_id': '3056053',
	57	'upload_date': '20150822',
	58	'age_limit': 18,
	59	},
d44a707f S	60	'params': {
	61	'proxy': '127.0.0.1:8118'
	62	},
	63	'skip': 'removed',
	64	}, {
	65	'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true',
	66	'only_matching': True,
551c7837	67	}]
7b2212e9	68
7b2212e9	69	def _real_extract(self, url):
d44a707f	70	video_id = self._match_id(url)
7b2212e9	71
d44a707f S	72	video = self._download_json(
d44a707f S	73	'https://www.spankwire.com/api/video/%s.json' % video_id, video_id)
9767726b	74
d44a707f	75	title = video['title']
7b2212e9	76
d44a707f S	77	formats = []
	78	videos = video.get('videos')
	79	if isinstance(videos, dict):
	80	for format_id, format_url in videos.items():
	81	video_url = url_or_none(format_url)
	82	if not format_url:
	83	continue
	84	height = int_or_none(self._search_regex(
	85	r'(\d+)[pP]', format_id, 'height', default=None))
	86	m = re.search(
	87	r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url)
	88	if m:
	89	tbr = int(m.group('tbr'))
	90	height = height or int(m.group('height'))
	91	else:
	92	tbr = None
	93	formats.append({
	94	'url': video_url,
	95	'format_id': '%dp' % height if height else format_id,
	96	'height': height,
	97	'tbr': tbr,
	98	})
	99	m3u8_url = url_or_none(video.get('HLS'))
	100	if m3u8_url:
	101	formats.extend(self._extract_m3u8_formats(
	102	m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
	103	m3u8_id='hls', fatal=False))
fdb4d278	104
d44a707f	105	view_count = str_to_int(video.get('viewed'))
7b2212e9	106
d44a707f S	107	thumbnails = []
	108	for preference, t in enumerate(('', '2x'), start=0):
	109	thumbnail_url = url_or_none(video.get('poster%s' % t))
	110	if not thumbnail_url:
	111	continue
	112	thumbnails.append({
	113	'url': thumbnail_url,
	114	'preference': preference,
6a1df4fb	115	})
7b2212e9	116
d44a707f S	117	def extract_names(key):
	118	entries_list = video.get(key)
	119	if not isinstance(entries_list, list):
	120	return
	121	entries = []
	122	for entry in entries_list:
	123	name = str_or_none(entry.get('name'))
	124	if name:
	125	entries.append(name)
	126	return entries
	127
	128	categories = extract_names('categories')
	129	tags = extract_names('tags')
	130
	131	uploader = None
	132	info = {}
750e9833	133
d44a707f S	134	webpage = self._download_webpage(
	135	'https://www.spankwire.com/_/video%s/' % video_id, video_id,
	136	fatal=False)
	137	if webpage:
	138	info = self._search_json_ld(webpage, video_id, default={})
	139	thumbnail_url = None
	140	if 'thumbnail' in info:
	141	thumbnail_url = url_or_none(info['thumbnail'])
	142	del info['thumbnail']
	143	if not thumbnail_url:
	144	thumbnail_url = self._og_search_thumbnail(webpage)
	145	if thumbnail_url:
	146	thumbnails.append({
	147	'url': thumbnail_url,
	148	'preference': 10,
	149	})
	150	uploader = self._html_search_regex(
	151	r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>',
	152	webpage, 'uploader', fatal=False)
	153	if not view_count:
	154	view_count = str_to_int(self._search_regex(
	155	r'data-views=["\']([\d,.]+)', webpage, 'view count',
	156	fatal=False))
	157
	158	return merge_dicts({
7b2212e9	159	'id': video_id,
9767726b	160	'title': title,
d44a707f S	161	'description': video.get('description'),
	162	'duration': int_or_none(video.get('duration')),
	163	'thumbnails': thumbnails,
9767726b	164	'uploader': uploader,
d44a707f S	165	'uploader_id': str_or_none(video.get('userId')),
	166	'timestamp': int_or_none(video.get('time_approved_on')),
	167	'average_rating': float_or_none(video.get('rating')),
9767726b	168	'view_count': view_count,
d44a707f S	169	'comment_count': int_or_none(video.get('comments')),
	170	'age_limit': 18,
	171	'categories': categories,
	172	'tags': tags,
7b2212e9	173	'formats': formats,
d44a707f	174	}, info)