[yt-dlp.git] / youtube_dl / extractor / sexykarma.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
import re
import datetime

class SexyKarmaIE(InfoExtractor):
    """Extractor for video pages under sexykarma.com/gonewild/video/.

    The video id is the token following the last hyphen of the page slug
    (see ``_VALID_URL``).  The title and the media URL are required; the
    uploader, thumbnail, duration, view count, upload date and categories
    are scraped on a best-effort basis and are ``None`` (or an empty list
    for categories) when they cannot be found or parsed.
    """

    # The literal ".html" suffix is matched with an escaped dot so that the
    # pattern does not accept an arbitrary character in its place.
    _VALID_URL = r'https?://(?:www\.)?sexykarma\.com/gonewild/video/.+\-(?P<id>[a-zA-Z0-9\-]+)\.html'
    _TESTS = [{
        'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html',
        'md5': 'b9798e7d1ef1765116a8f516c8091dbd',
        'info_dict': {
            'id': 'yHI70cOyIHt',
            'ext': 'mp4',
            'title': 'Taking a quick pee.',
            'uploader_id': 'wildginger7',
            # Raw string: "\." is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': int,
            'view_count': int,
            'upload_date': '20141007',
        }
    }, {
        'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html',
        'md5': 'dd216c68d29b49b12842b9babe762a5d',
        'info_dict': {
            'id': '8Id6EZPbuHf',
            'ext': 'mp4',
            'title': 'pot_pixie tribute',
            'uploader_id': 'banffite',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': int,
            'view_count': int,
            'upload_date': '20141013',
        }
    }]

    # English month names as they appear on the "Added:" line.  The date is
    # parsed by hand because strptime('%B') depends on the process locale.
    _MONTH_NAMES = (
        'january', 'february', 'march', 'april', 'may', 'june', 'july',
        'august', 'september', 'october', 'november', 'december',
    )

    def _real_extract(self, url):
        """Download the video page at *url* and return its info dict.

        The returned dict has the keys ``id``, ``title``, ``uploader_id``,
        ``url``, ``thumbnail``, ``duration``, ``view_count``, ``upload_date``
        and ``categories``.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # Mandatory fields: without a title and a media URL there is nothing
        # useful to return, so these lookups keep their default strictness.
        title = self._html_search_regex(
            r'<h2 class="he2"><span>(.*?)</span>', webpage, 'title')
        # Kept in its own name so the page URL parameter is not shadowed.
        video_url = self._html_search_regex(
            r'<p><a href="(.*?)" ?\n*target="_blank"><font color',
            webpage, 'url')

        # Optional metadata: looked up with fatal=False so that a missing
        # field is not supposed to abort the extraction (semantics of the
        # InfoExtractor helper - confirm against common.py).  Every consumer
        # below tolerates a None result.
        uploader_id = self._html_search_regex(
            r'class="aupa">\n*(.*?)</a>', webpage, 'uploader', fatal=False)
        thumbnail = self._html_search_regex(
            r'<div id="player" style="z-index:1;"> <span id="edge"></span> <span id="container"><img[\n ]*src="(.+?)"',
            webpage, 'thumbnail', fatal=False)

        duration = self._to_seconds(self._html_search_regex(
            r'<tr>[\n\s]*<td>Time: </td>[\n\s]*<td align="right"><span>(.+)\n*',
            webpage, 'duration', fatal=False))

        view_count = self._parse_count(self._html_search_regex(
            r'<tr>[\n\s]*<td>Views: </td>[\n\s]*<td align="right"><span>(.+)</span>',
            webpage, 'view_count', fatal=False))

        upload_date = self._parse_upload_date(self._html_search_regex(
            r'class="aup">Added: <strong>(.*?)</strong>',
            webpage, 'date', fatal=False))

        # Category links point at the site's search pages; dots are escaped
        # and both http/https with an optional "www." are accepted.
        categories = re.findall(
            r'https?://(?:www\.)?sexykarma\.com/gonewild/search/video/(?:.+?)"><span>(.*?)</span>',
            webpage)

        return {
            'id': video_id,
            'title': title,
            'uploader_id': uploader_id,
            'url': video_url,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'upload_date': upload_date,
            'categories': categories,
        }

    def _to_seconds(self, timestr):
        """Convert a colon-separated duration string to a number of seconds.

        Each colon-separated part is treated as the next base-60 digit, so
        ``'41'`` -> 41, ``'1:05'`` -> 65 and ``'1:00:00'`` -> 3600.

        Returns ``None`` when *timestr* is empty/``None`` or when any part is
        not an integer.
        """
        if not timestr:
            return None
        seconds = 0
        try:
            for part in timestr.strip().split(':'):
                seconds = seconds * 60 + int(part)
        except ValueError:
            return None
        return seconds

    def _parse_count(self, value):
        """Parse a view counter such as ``'1234'`` or ``'1,234'`` into an int.

        Commas and whitespace are dropped before conversion.  Returns
        ``None`` when *value* is empty/``None`` or is not a plain integer.
        """
        if not value:
            return None
        try:
            return int(re.sub(r'[,\s]', '', value))
        except ValueError:
            return None

    def _parse_upload_date(self, date_str):
        """Convert a date like ``'October 7, 2014'`` to ``'20141007'``.

        The month must be an English month name or an unambiguous prefix of
        at least three letters (case-insensitive).  Returns ``None`` when
        *date_str* is empty/``None``, has another layout, or does not denote
        a valid calendar date.
        """
        if not date_str:
            return None
        mobj = re.match(
            r'\s*([A-Za-z]+)\s+(\d{1,2}),\s*(\d{4})\s*$', date_str)
        if not mobj:
            return None
        month_token = mobj.group(1).lower()
        month = None
        if len(month_token) >= 3:
            for index, name in enumerate(self._MONTH_NAMES, 1):
                if name.startswith(month_token):
                    month = index
                    break
        if month is None:
            return None
        try:
            # datetime.date validates the day against the month and year.
            parsed = datetime.date(
                int(mobj.group(3)), month, int(mobj.group(2)))
        except ValueError:
            return None
        return '%04d%02d%02d' % (parsed.year, parsed.month, parsed.day)
Commit	Line	Data
47408645 C	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	from .common import InfoExtractor
1723edb1 C	5	import re
1723edb1 C	6	import datetime
47408645 C	7
47408645 C	8	class SexyKarmaIE(InfoExtractor):
1723edb1	9	_VALID_URL = r'https?://(?:www\.)?sexykarma\.com/gonewild/video/.+\-(?P<id>[a-zA-Z0-9\-]+)(.html)'
47408645 C	10	_TESTS = [{
	11	'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html',
	12	'md5': 'b9798e7d1ef1765116a8f516c8091dbd',
	13	'info_dict': {
1723edb1	14	'id': 'yHI70cOyIHt',
47408645 C	15	'ext': 'mp4',
47408645 C	16	'title': 'Taking a quick pee.',
1723edb1 C	17	'uploader_id': 'wildginger7',
	18	'thumbnail': 're:^https?://.*\.jpg$',
	19	'duration': int,
	20	'view_count': int,
	21	'upload_date': '20141007',
47408645 C	22	}
	23	}, {
	24	'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html',
	25	'md5': 'dd216c68d29b49b12842b9babe762a5d',
	26	'info_dict': {
1723edb1	27	'id': '8Id6EZPbuHf',
47408645 C	28	'ext': 'mp4',
47408645 C	29	'title': 'pot_pixie tribute',
1723edb1 C	30	'uploader_id': 'banffite',
	31	'thumbnail': 're:^https?://.*\.jpg$',
	32	'duration': int,
	33	'view_count': int,
	34	'upload_date': '20141013',
47408645 C	35	}
	36	}]
	37
	38	def _real_extract(self, url):
	39	video_id = self._match_id(url)
	40
47408645	41	webpage = self._download_webpage(url, video_id)
1723edb1	42
47408645	43	title = self._html_search_regex(r'<h2 class="he2"><span>(.*?)</span>', webpage, 'title')
1723edb1	44	uploader_id = self._html_search_regex(r'class="aupa">\n*(.*?)</a>', webpage, 'uploader')
47408645	45	url = self._html_search_regex(r'<p><a href="(.*?)" ?\n*target="_blank"><font color', webpage, 'url')
1723edb1 C	46	thumbnail = self._html_search_regex(r'<div id="player" style="z-index:1;"> <span id="edge"></span> <span id="container"><img[\n ]*src="(.+?)"', webpage, 'thumbnail')
	47
	48	str_duration = self._html_search_regex(r'<tr>[\n\s]*<td>Time: </td>[\n\s]*<td align="right"><span>(.+)\n*', webpage, 'duration')
	49	duration = self._to_seconds(str_duration)
	50
	51	str_views = self._html_search_regex(r'<tr>[\n\s]*<td>Views: </td>[\n\s]*<td align="right"><span>(.+)</span>', webpage, 'view_count')
	52	view_count = int(str_views)
	53	# print view_count
	54
	55	date = self._html_search_regex(r'class="aup">Added: <strong>(.*?)</strong>', webpage, 'date')
	56	d = datetime.datetime.strptime(date, '%B %d, %Y')
	57	upload_date = d.strftime('%Y%m%d')
47408645	58
7da224c9 C	59	categories = re.findall(r'http://www.sexykarma.com/gonewild/search/video/(?:.+?)"><span>(.*?)</span>', webpage)
7da224c9 C	60
47408645 C	61	return {
	62	'id': video_id,
	63	'title': title,
1723edb1 C	64	'uploader_id': uploader_id,
	65	'url': url,
	66	'thumbnail': thumbnail,
	67	'duration': duration,
	68	'view_count': view_count,
	69	'upload_date': upload_date,
7da224c9	70	'categories': categories,
47408645	71	}
1723edb1 C	72
	73	def _to_seconds(self, timestr):
	74	seconds= 0
	75	for part in timestr.split(':'):
	76	seconds= seconds*60 + int(part)
	77	return seconds