[yt-dlp.git] / youtube_dl / extractor / jukebox.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    RegexNotFoundError,
    unescapeHTML,
)


class JukeboxIE(InfoExtractor):
    """Information extractor for video pages matching ``_VALID_URL``.

    The video page embeds an iframe. The iframe markup carries a
    ``"config"`` entry whose ``"file"`` value is either a direct media URL
    (ending in an ``mdtk`` token) or a YouTube watch URL. Direct media URLs
    are returned as-is; YouTube URLs are delegated to the Youtube extractor.
    """

    # NOTE(review): ``jukebox?`` makes the trailing "x" optional, so a host
    # beginning with "www.jukebo." matches as well. Presumably unintended;
    # confirm before tightening, since it changes which URLs are accepted.
    _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html'
    _TEST = {
        'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
        'info_dict': {
            'id': 'r303r',
            'ext': 'flv',
            'title': 'Kosheen-En Vivo Pride',
            'uploader': 'Kosheen',
        },
    }

    @staticmethod
    def _unescape_url(escaped_url):
        """Decode HTML entities and backslash-escaped slashes in a URL.

        The URLs are scraped from a ``"config"`` structure in which every
        slash is written as a backslash followed by a slash.
        """
        # '\\/' is an explicit backslash + slash; the former '\/' spelling
        # relied on an invalid escape sequence, which newer Python versions
        # warn about at compile time.
        return unescapeHTML(escaped_url).replace('\\/', '/')

    def _real_extract(self, url):
        """Extract the video information for *url*.

        Returns an info dict with ``id``, ``url``, ``title`` and
        ``uploader`` when the iframe exposes a direct media URL, otherwise
        the result of ``url_result`` pointing at the embedded YouTube video.

        Raises ExtractorError when the iframe markup contains the
        ``jkb_waiting`` class.
        """
        video_id = self._match_id(url)

        html = self._download_webpage(url, video_id)
        iframe_url = unescapeHTML(self._search_regex(
            r'<iframe .*src="([^"]*)"', html, 'iframe url'))

        iframe_html = self._download_webpage(
            iframe_url, video_id, 'Downloading iframe')
        if re.search(r'class="jkb_waiting"', iframe_html) is not None:
            raise ExtractorError('Video is not available(in your country?)!')

        self.report_extraction(video_id)

        # Only the regex lookup can raise RegexNotFoundError, so it is the
        # only statement kept inside the ``try``.
        try:
            video_url = self._search_regex(
                r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"',
                iframe_html, 'video url')
        except RegexNotFoundError:
            # No direct media URL: fall back to an embedded YouTube video.
            youtube_url = self._unescape_url(self._search_regex(
                r'config":{"file":"(http:\\/\\/www\.youtube\.com\\/watch\?v=[^"]+)"',
                iframe_html, 'youtube url'))
            self.to_screen('Youtube video detected')
            return self.url_result(youtube_url, ie='Youtube')
        video_url = self._unescape_url(video_url)

        # Title and artist come from the outer page, not from the iframe.
        title = self._html_search_regex(
            r'<h1 class="inline">([^<]+)</h1>', html, 'title')
        artist = self._html_search_regex(
            r'<span id="infos_article_artist">([^<]+)</span>', html, 'artist')

        return {
            'id': video_id,
            'url': video_url,
            'title': artist + '-' + title,
            'uploader': artist,
        }
Commit	Line	Data
b4a186b7 JMF	1	from __future__ import unicode_literals
b4a186b7 JMF	2
28ef06f7	3	import re
	4
	5	from .common import InfoExtractor
	6	from ..utils import (
	7	ExtractorError,
b4a186b7	8	RegexNotFoundError,
28ef06f7	9	unescapeHTML,
	10	)
	11
b4a186b7	12
28ef06f7	13	class JukeboxIE(InfoExtractor):
937f935d	14	_VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html'
b4a186b7 JMF	15	_TEST = {
b4a186b7 JMF	16	'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
b4a186b7 JMF	17	'info_dict': {
	18	'id': 'r303r',
	19	'ext': 'flv',
	20	'title': 'Kosheen-En Vivo Pride',
	21	'uploader': 'Kosheen',
	22	},
	23	}
28ef06f7	24
28ef06f7	25	def _real_extract(self, url):
937f935d	26	video_id = self._match_id(url)
28ef06f7	27
28ef06f7	28	html = self._download_webpage(url, video_id)
b4a186b7	29	iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url'))
28ef06f7	30
28ef06f7	31	iframe_html = self._download_webpage(iframe_url, video_id, 'Downloading iframe')
b4a186b7 JMF	32	if re.search(r'class="jkb_waiting"', iframe_html) is not None:
b4a186b7 JMF	33	raise ExtractorError('Video is not available(in your country?)!')
28ef06f7	34
	35	self.report_extraction(video_id)
	36
b4a186b7 JMF	37	try:
b4a186b7 JMF	38	video_url = self._search_regex(r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"',
9e1a5b84	39	iframe_html, 'video url')
b4a186b7 JMF	40	video_url = unescapeHTML(video_url).replace('\/', '/')
	41	except RegexNotFoundError:
	42	youtube_url = self._search_regex(
	43	r'config":{"file":"(http:\\/\\/www\.youtube\.com\\/watch\?v=[^"]+)"',
	44	iframe_html, 'youtube url')
	45	youtube_url = unescapeHTML(youtube_url).replace('\/', '/')
	46	self.to_screen('Youtube video detected')
	47	return self.url_result(youtube_url, ie='Youtube')
	48
	49	title = self._html_search_regex(r'<h1 class="inline">([^<]+)</h1>',
9e1a5b84	50	html, 'title')
b4a186b7	51	artist = self._html_search_regex(r'<span id="infos_article_artist">([^<]+)</span>',
9e1a5b84	52	html, 'artist')
b4a186b7 JMF	53
	54	return {
	55	'id': video_id,
	56	'url': video_url,
	57	'title': artist + '-' + title,
	58	'uploader': artist,
	59	}