]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/channel9.py
Merge pull request #8061 from dstftw/introduce-chapter-and-series-fields
[yt-dlp.git] / youtube_dl / extractor / channel9.py
CommitLineData
adc267ee 1from __future__ import unicode_literals
df537474 2
3import re
4
5from .common import InfoExtractor
1db82381
S
6from ..utils import (
7 ExtractorError,
8 parse_filesize,
9 qualities,
10)
df537474 11
5f6a1245 12
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                # NOTE: raw string — '\.' in a plain literal raises an
                # invalid-escape DeprecationWarning on modern Python.
                'thumbnail': r're:http://.*\.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': r're:http://.*\.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
        {
            # low quality mp4 is best
            'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
            'info_dict': {
                'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
                'ext': 'mp4',
                'title': 'Ranges for the Standard Library',
                'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
                'duration': 5646,
                'thumbnail': r're:http://.*\.jpg',
            },
            'params': {
                'skip_download': True,
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _formats_from_html(self, html):
        """Scrape the downloadable-format links (URL, quality label, usage
        note and optional file-size popup) out of the page HTML and return a
        sorted formats list."""
        # The (?x) flag must be at the very start of the pattern: Python 3.11+
        # rejects global inline flags placed anywhere else.
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?  # File size part may be missing
        '''
        quality = qualities((
            'MP3', 'MP4',
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            # An 'Audio only' usage note means the download has no video track.
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html)]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        """Return the page title, preferring the <meta> title over OpenGraph
        and stripping the trailing ' (Channel 9)' suffix if present."""
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
        TITLE_SUFFIX = ' (Channel 9)'
        if title is not None and title.endswith(TITLE_SUFFIX):
            title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        """Return the entry body description, falling back to the
        description <meta> tag when the entry-content markup is absent."""
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        """Return the duration in seconds parsed from the "length" JSON
        snippet (HH:MM:SS), or None if not found."""
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        # URL of the slides download link, or None.
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        # URL of the zip download link, or None.
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        # Average rating as float; 0 when the markup is absent.
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        # Number of ratings as int; 0 when the markup is absent.
        # _fix_count already returns an int, so no extra int() wrapper is needed.
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        # View count as int; 0 when the markup is absent.
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        # Comment count as int; 0 when the markup is absent.
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
        # Convert a thousands-separated count like '1,234' to int; None stays None.
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        """Return the list of author names from the author <li>, or None
        when the markup is absent."""
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day').strip() if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        """Build the list of downloadable entries (slides, zip, recording)
        for a content page, sharing the common metadata between them.

        Returns None (after a warning) when nothing is downloadable.
        """
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        """Extract a single 'Entry' page; such a page must yield exactly one
        downloadable item, which is annotated with the author list."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result = contents[0]
        result['authors'] = self._extract_authors(html)

        return result

    def _extract_session(self, html, content_path):
        """Extract an event 'Session' page as a playlist of its downloadable
        items, each annotated with session metadata (code, day, room,
        speakers)."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return self.playlist_result(contents)

    def _extract_list(self, content_path):
        """Extract an event/list page via its RSS feed as a playlist of
        url_result entries, one per session link."""
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        # Dispatch on the page type embedded in the WT.entryid meta tag;
        # a page without it is assumed to be a list.
        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is not None:
            page_type = page_type_m.group('pagetype')
            if page_type == 'Entry':  # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)

        else:  # Assuming list
            return self._extract_list(content_path)