[yt-dlp.git] / youtube_dl / extractor / bliptv.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_urllib_request,
    unescapeHTML,
    parse_iso8601,
    compat_urlparse,
    clean_html,
    compat_str,
)


class BlipTVIE(SubtitlesInfoExtractor):
    """Extractor for a single blip.tv video.

    Handles regular episode URLs and ``rss/flash/<id>`` URLs (numeric id in
    the URL) as well as ``play/<lookup_id>`` and ``api.swf#<lookup_id>``
    URLs, whose lookup id is first resolved to a numeric episode id.
    """
    _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+]+)))'

    _TESTS = [
        {
            'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
            'md5': 'c6934ad0b6acf2bd920720ec888eb812',
            'info_dict': {
                'id': '5779306',
                'ext': 'mov',
                'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
                'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
                'timestamp': 1323138843,
                'upload_date': '20111206',
                'uploader': 'cbr',
                'uploader_id': '679425',
                'duration': 81,
            }
        },
        {
            # https://github.com/rg3/youtube-dl/pull/2274
            'note': 'Video with subtitles',
            'url': 'http://blip.tv/play/h6Uag5OEVgI.html',
            'md5': '309f9d25b820b086ca163ffac8031806',
            'info_dict': {
                'id': '6586561',
                'ext': 'mp4',
                'title': 'Red vs. Blue Season 11 Episode 1',
                'description': 'One-Zero-One',
                'timestamp': 1371261608,
                'upload_date': '20130615',
                'uploader': 'redvsblue',
                'uploader_id': '792887',
                'duration': 279,
            }
        }
    ]

    def _real_extract(self, url):
        """Build the info dict for the video behind ``url``.

        Downloads the video's RSS item, reads the metadata from it and
        resolves every ``media:content`` entry into either a format or a
        subtitle track.

        Returns the info dict, or ``None`` when the ``listsubtitles``
        downloader option is set (the available subtitles are listed
        instead).
        """
        mobj = re.match(self._VALID_URL, url)
        lookup_id = mobj.group('lookup_id')

        # See https://github.com/rg3/youtube-dl/issues/857
        if lookup_id:
            info_page = self._download_webpage(
                'http://blip.tv/play/%s.x?p=1' % lookup_id, lookup_id, 'Resolving lookup id')
            video_id = self._search_regex(r'data-episode-id="([0-9]+)', info_page, 'video_id')
        else:
            video_id = mobj.group('id')

        rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS')

        # Helpers producing namespace-qualified tag/attribute names.
        def blip(s):
            return '{http://blip.tv/dtd/blip/1.0}%s' % s

        def media(s):
            return '{http://search.yahoo.com/mrss/}%s' % s

        def itunes(s):
            return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s

        def to_int(value):
            # Optional numeric metadata must not abort the extraction when
            # it is missing (None) or malformed.
            try:
                return int(value)
            except (TypeError, ValueError):
                return None

        item = rss.find('channel/item')

        # The id reported by the feed replaces the one taken from the URL
        # (the first test case shows that the two may differ).
        video_id = item.find(blip('item_id')).text
        title = item.find('./title').text
        # An absent or empty description used to be turned into the literal
        # string 'None' by compat_str(None); report it as missing instead.
        raw_description = item.findtext(blip('puredescription'))
        description = clean_html(compat_str(raw_description)) if raw_description else None
        timestamp = parse_iso8601(item.find(blip('datestamp')).text)
        uploader = item.find(blip('user')).text
        uploader_id = item.find(blip('userid')).text
        duration = to_int(item.findtext(blip('runtime')))

        # Prefer media:thumbnail, fall back to itunes:image; neither is
        # required for a successful extraction.
        media_thumbnail = item.find(media('thumbnail'))
        if media_thumbnail is not None:
            thumbnail = media_thumbnail.get('url')
        else:
            itunes_image = item.find(itunes('image'))
            thumbnail = itunes_image.text if itunes_image is not None else None
        categories = [category.text for category in item.findall('category')]

        formats = []
        subtitles = {}

        # Maps the language name found in a subtitle role to a language
        # code; unknown names are used unchanged.
        LANGS = {
            'english': 'en',
        }

        media_group = item.find(media('group'))
        for media_content in media_group.findall(media('content')):
            media_url = media_content.get('url')
            role = media_content.get(blip('role'))
            msg = self._download_webpage(
                media_url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url',
                video_id, 'Resolving URL for %s' % role)
            # The response is a query string; strip surrounding whitespace
            # so that it does not end up inside the resolved URL.
            real_url = compat_urlparse.parse_qs(msg.strip())['message'][0]

            # A missing type attribute is treated as "neither subtitle nor
            # video" instead of raising AttributeError below.
            media_type = media_content.get('type') or ''
            if media_type == 'text/srt' or media_url.endswith('.srt'):
                lang = role.rpartition('-')[-1].strip().lower()
                langcode = LANGS.get(lang, lang)
                subtitles[langcode] = media_url
            elif media_type.startswith('video/'):
                formats.append({
                    'url': real_url,
                    'format_id': role,
                    'format_note': media_type,
                    'vcodec': media_content.get(blip('vcodec')),
                    'acodec': media_content.get(blip('acodec')),
                    # XML attributes are strings; expose sizes as integers
                    # (or None when absent).
                    'filesize': to_int(media_content.get('filesize')),
                    'width': to_int(media_content.get('width')),
                    'height': to_int(media_content.get('height')),
                })
        self._sort_formats(formats)

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, subtitles)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, subtitles)
            return

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'duration': duration,
            'thumbnail': thumbnail,
            'categories': categories,
            'formats': formats,
            'subtitles': video_subtitles,
        }

    def _download_subtitle_url(self, sub_lang, url):
        """Download and return the subtitle document at ``url``.

        ``sub_lang`` is accepted for interface compatibility and not used.
        """
        # For some weird reason, blip.tv serves a video instead of subtitles
        # when we request with a common UA
        req = compat_urllib_request.Request(url)
        # NOTE(review): presumably this header is picked up by the
        # downloader's request handler to override the User-Agent - confirm.
        req.add_header('Youtubedl-user-agent', 'youtube-dl')
        return self._download_webpage(req, None, note=False)


class BlipTVUserIE(InfoExtractor):
    """Extractor for all videos of a blip.tv user.

    Accepts ``blip.tv/<username>`` URLs (with or without scheme) and the
    ``bliptvuser:<username>`` pseudo URL.
    """
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = 'blip.tv:user'

    def _real_extract(self, url):
        """Collect the user's videos and return them as a playlist.

        The playlist result is returned inside a single-element list.
        Raises the extractor's regex-not-found error when the user page
        carries no ``data-users-id`` attribute.
        """
        mobj = re.match(self._VALID_URL, url)
        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # _VALID_URL also accepts "bliptvuser:<name>" and scheme-less URLs,
        # neither of which can be downloaded as-is.
        if url.startswith('bliptvuser:'):
            url = 'http://blip.tv/%s' % username
        elif not re.match(r'https?://', url):
            url = 'http://' + url

        page = self._download_webpage(url, username, 'Downloading user page')
        # _search_regex reports a missing attribute as an extraction error
        # instead of an AttributeError on a None match object.
        users_id = self._search_regex(
            r'data-users-id="([^"]+)"', page, 'users id')
        page_base = page_base % users_id

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            page_url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(
                page_url, username, 'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape before the duplicate check: the list holds
                # unescaped values, so comparing the raw href against it
                # would let entity-encoded duplicates through.
                video_path = unescapeHTML(mobj.group(1))
                if video_path not in ids_in_page:
                    ids_in_page.append(video_path)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
        return [self.playlist_result(url_entries, playlist_title=username)]
Commit	Line	Data
bca4e930 PH	1	from __future__ import unicode_literals
bca4e930 PH	2
f5884801	3	import re
f5884801 PH	4
f5884801 PH	5	from .common import InfoExtractor
b4bcffef	6	from .subtitles import SubtitlesInfoExtractor
f5884801	7	from ..utils import (
f5884801	8	compat_urllib_request,
f5884801	9	unescapeHTML,
481efc84 S	10	parse_iso8601,
	11	compat_urlparse,
	12	clean_html,
	13	compat_str,
f5884801 PH	14	)
	15
	16
b4bcffef	17	class BlipTVIE(SubtitlesInfoExtractor):
b7c33124	18	_VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-\|rss/flash/)(?P<id>\d+)\|((?:play/\|api\.swf#)(?P<lookup_id>[\da-zA-Z+]+)))'
481efc84 S	19
	20	_TESTS = [
	21	{
	22	'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
	23	'md5': 'c6934ad0b6acf2bd920720ec888eb812',
	24	'info_dict': {
	25	'id': '5779306',
	26	'ext': 'mov',
	27	'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
	28	'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
	29	'timestamp': 1323138843,
	30	'upload_date': '20111206',
	31	'uploader': 'cbr',
	32	'uploader_id': '679425',
	33	'duration': 81,
	34	}
	35	},
	36	{
	37	# https://github.com/rg3/youtube-dl/pull/2274
	38	'note': 'Video with subtitles',
	39	'url': 'http://blip.tv/play/h6Uag5OEVgI.html',
	40	'md5': '309f9d25b820b086ca163ffac8031806',
	41	'info_dict': {
	42	'id': '6586561',
	43	'ext': 'mp4',
	44	'title': 'Red vs. Blue Season 11 Episode 1',
	45	'description': 'One-Zero-One',
	46	'timestamp': 1371261608,
	47	'upload_date': '20130615',
	48	'uploader': 'redvsblue',
	49	'uploader_id': '792887',
	50	'duration': 279,
	51	}
b4bcffef	52	}
481efc84	53	]
f5884801 PH	54
	55	def _real_extract(self, url):
	56	mobj = re.match(self._VALID_URL, url)
481efc84	57	lookup_id = mobj.group('lookup_id')
f5884801 PH	58
f5884801 PH	59	# See https://github.com/rg3/youtube-dl/issues/857
481efc84 S	60	if lookup_id:
	61	info_page = self._download_webpage(
	62	'http://blip.tv/play/%s.x?p=1' % lookup_id, lookup_id, 'Resolving lookup id')
	63	video_id = self._search_regex(r'data-episode-id="([0-9]+)', info_page, 'video_id')
b4bcffef	64	else:
481efc84 S	65	video_id = mobj.group('id')
	66
	67	rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS')
	68
	69	def blip(s):
	70	return '{http://blip.tv/dtd/blip/1.0}%s' % s
	71
	72	def media(s):
	73	return '{http://search.yahoo.com/mrss/}%s' % s
	74
	75	def itunes(s):
	76	return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s
	77
	78	item = rss.find('channel/item')
	79
	80	video_id = item.find(blip('item_id')).text
	81	title = item.find('./title').text
	82	description = clean_html(compat_str(item.find(blip('puredescription')).text))
	83	timestamp = parse_iso8601(item.find(blip('datestamp')).text)
	84	uploader = item.find(blip('user')).text
	85	uploader_id = item.find(blip('userid')).text
	86	duration = int(item.find(blip('runtime')).text)
	87	media_thumbnail = item.find(media('thumbnail'))
	88	thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text
	89	categories = [category.text for category in item.findall('category')]
b4bcffef	90
b4bcffef	91	formats = []
481efc84 S	92	subtitles = {}
	93
	94	media_group = item.find(media('group'))
	95	for media_content in media_group.findall(media('content')):
	96	url = media_content.get('url')
	97	role = media_content.get(blip('role'))
	98	msg = self._download_webpage(
	99	url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url',
	100	video_id, 'Resolving URL for %s' % role)
	101	real_url = compat_urlparse.parse_qs(msg)['message'][0]
	102
	103	media_type = media_content.get('type')
	104	if media_type == 'text/srt' or url.endswith('.srt'):
	105	LANGS = {
	106	'english': 'en',
	107	}
	108	lang = role.rpartition('-')[-1].strip().lower()
	109	langcode = LANGS.get(lang, lang)
	110	subtitles[langcode] = url
	111	elif media_type.startswith('video/'):
531147dd	112	formats.append({
481efc84 S	113	'url': real_url,
	114	'format_id': role,
	115	'format_note': media_type,
	116	'vcodec': media_content.get(blip('vcodec')),
	117	'acodec': media_content.get(blip('acodec')),
	118	'filesize': media_content.get('filesize'),
	119	'width': int(media_content.get('width')),
	120	'height': int(media_content.get('height')),
531147dd	121	})
b4bcffef PH	122	self._sort_formats(formats)
	123
	124	# subtitles
	125	video_subtitles = self.extract_subtitles(video_id, subtitles)
	126	if self._downloader.params.get('listsubtitles', False):
	127	self._list_available_subtitles(video_id, subtitles)
	128	return
	129
	130	return {
	131	'id': video_id,
481efc84 S	132	'title': title,
	133	'description': description,
	134	'timestamp': timestamp,
	135	'uploader': uploader,
	136	'uploader_id': uploader_id,
	137	'duration': duration,
	138	'thumbnail': thumbnail,
	139	'categories': categories,
b4bcffef PH	140	'formats': formats,
	141	'subtitles': video_subtitles,
	142	}
466617f5	143
b4bcffef PH	144	def _download_subtitle_url(self, sub_lang, url):
	145	# For some weird reason, blip.tv serves a video instead of subtitles
	146	# when we request with a common UA
	147	req = compat_urllib_request.Request(url)
	148	req.add_header('Youtubedl-user-agent', 'youtube-dl')
	149	return self._download_webpage(req, None, note=False)
f5884801 PH	150
	151
	152	class BlipTVUserIE(InfoExtractor):
f5884801 PH	153	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)\|bliptvuser:)([^/]+)/*$'
f5884801 PH	154	_PAGE_SIZE = 12
bca4e930	155	IE_NAME = 'blip.tv:user'
f5884801 PH	156
f5884801 PH	157	def _real_extract(self, url):
f5884801	158	mobj = re.match(self._VALID_URL, url)
f5884801 PH	159	username = mobj.group(1)
	160
	161	page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
	162
bca4e930	163	page = self._download_webpage(url, username, 'Downloading user page')
f5884801 PH	164	mobj = re.search(r'data-users-id="([^"]+)"', page)
	165	page_base = page_base % mobj.group(1)
	166
f5884801 PH	167	# Download video ids using BlipTV Ajax calls. Result size per
	168	# query is limited (currently to 12 videos) so we need to query
	169	# page by page until there are no video ids - it means we got
	170	# all of them.
	171
	172	video_ids = []
	173	pagenum = 1
	174
	175	while True:
	176	url = page_base + "&page=" + str(pagenum)
b4bcffef PH	177	page = self._download_webpage(
b4bcffef PH	178	url, username, 'Downloading video ids from page %d' % pagenum)
f5884801 PH	179
	180	# Extract video identifiers
	181	ids_in_page = []
	182
	183	for mobj in re.finditer(r'href="/([^"]+)"', page):
	184	if mobj.group(1) not in ids_in_page:
	185	ids_in_page.append(unescapeHTML(mobj.group(1)))
	186
	187	video_ids.extend(ids_in_page)
	188
	189	# A little optimization - if current page is not
	190	# "full", ie. does not contain PAGE_SIZE video ids then
	191	# we can assume that this page is the last one - there
	192	# are no more ids on further pages - no need to query
	193	# again.
	194
	195	if len(ids_in_page) < self._PAGE_SIZE:
	196	break
	197
	198	pagenum += 1
	199
bca4e930	200	urls = ['http://blip.tv/%s' % video_id for video_id in video_ids]
20c3893f	201	url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
b4bcffef	202	return [self.playlist_result(url_entries, playlist_title=username)]