]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/channel9.py
Merge pull request #7176 from remitamine/megavideoz
[yt-dlp.git] / youtube_dl / extractor / channel9.py
CommitLineData
adc267ee 1from __future__ import unicode_literals
df537474 2
3import re
4
5from .common import InfoExtractor
4d2ebb6b 6from ..utils import ExtractorError
df537474 7
5f6a1245 8
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    # Greedy capture: everything after the host is the content path; an
    # optional trailing slash is tolerated.
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        }
    ]

    # Per-content RSS feed URL template, used for list (event) pages.
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
57
58 def _restore_bytes(self, formatted_size):
59 if not formatted_size:
60 return 0
61 m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
62 if not m:
63 return 0
64 units = m.group('units')
65 try:
adc267ee 66 exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
df537474 67 except ValueError:
68 return 0
69 size = float(m.group('size'))
70 return int(size * (1024 ** exponent))
71
72 def _formats_from_html(self, html):
73 FORMAT_REGEX = r'''
74 (?x)
75 <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
76 <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
77 (?:<div\s+class="popup\s+rounded">\s*
78 <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
79 </div>)? # File size part may be missing
80 '''
81 # Extract known formats
a1b92edb
PH
82 formats = [{
83 'url': x.group('url'),
84 'format_id': x.group('quality'),
85 'format_note': x.group('note'),
adc267ee 86 'format': '%s (%s)' % (x.group('quality'), x.group('note')),
5f6a1245 87 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
a1b92edb
PH
88 'preference': self._known_formats.index(x.group('quality')),
89 'vcodec': 'none' if x.group('note') == 'Audio only' else None,
90 } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
91
92 self._sort_formats(formats)
93
df537474 94 return formats
95
df537474 96 def _extract_title(self, html):
adc267ee 97 title = self._html_search_meta('title', html, 'title')
a316a83d 98 if title is None:
df537474 99 title = self._og_search_title(html)
adc267ee 100 TITLE_SUFFIX = ' (Channel 9)'
df537474 101 if title is not None and title.endswith(TITLE_SUFFIX):
102 title = title[:-len(TITLE_SUFFIX)]
103 return title
104
105 def _extract_description(self, html):
106 DESCRIPTION_REGEX = r'''(?sx)
107 <div\s+class="entry-content">\s*
108 <div\s+id="entry-body">\s*
109 (?P<description>.+?)\s*
110 </div>\s*
111 </div>
112 '''
113 m = re.search(DESCRIPTION_REGEX, html)
114 if m is not None:
115 return m.group('description')
adc267ee 116 return self._html_search_meta('description', html, 'description')
df537474 117
118 def _extract_duration(self, html):
a316a83d 119 m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
df537474 120 return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
121
122 def _extract_slides(self, html):
123 m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
124 return m.group('slidesurl') if m is not None else None
125
126 def _extract_zip(self, html):
127 m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
128 return m.group('zipurl') if m is not None else None
129
130 def _extract_avg_rating(self, html):
131 m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
132 return float(m.group('avgrating')) if m is not None else 0
133
134 def _extract_rating_count(self, html):
135 m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
136 return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
137
138 def _extract_view_count(self, html):
139 m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
140 return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
141
142 def _extract_comment_count(self, html):
143 m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
144 return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
145
146 def _fix_count(self, count):
147 return int(str(count).replace(',', '')) if count is not None else None
148
149 def _extract_authors(self, html):
150 m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
151 if m is None:
152 return None
153 return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
154
155 def _extract_session_code(self, html):
156 m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
157 return m.group('code') if m is not None else None
158
159 def _extract_session_day(self, html):
160 m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
506e261d 161 return m.group('day').strip() if m is not None else None
df537474 162
163 def _extract_session_room(self, html):
164 m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
165 return m.group('room') if m is not None else None
166
167 def _extract_session_speakers(self, html):
168 return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
169
170 def _extract_content(self, html, content_path):
a316a83d 171 # Look for downloadable content
df537474 172 formats = self._formats_from_html(html)
173 slides = self._extract_slides(html)
174 zip_ = self._extract_zip(html)
175
176 # Nothing to download
177 if len(formats) == 0 and slides is None and zip_ is None:
adc267ee 178 self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
df537474 179 return
180
181 # Extract meta
182 title = self._extract_title(html)
183 description = self._extract_description(html)
184 thumbnail = self._og_search_thumbnail(html)
185 duration = self._extract_duration(html)
186 avg_rating = self._extract_avg_rating(html)
187 rating_count = self._extract_rating_count(html)
188 view_count = self._extract_view_count(html)
189 comment_count = self._extract_comment_count(html)
190
b74e86f4
PH
191 common = {
192 '_type': 'video',
193 'id': content_path,
194 'description': description,
195 'thumbnail': thumbnail,
196 'duration': duration,
197 'avg_rating': avg_rating,
198 'rating_count': rating_count,
199 'view_count': view_count,
200 'comment_count': comment_count,
201 }
df537474 202
203 result = []
204
205 if slides is not None:
206 d = common.copy()
5f6a1245 207 d.update({'title': title + '-Slides', 'url': slides})
df537474 208 result.append(d)
209
210 if zip_ is not None:
211 d = common.copy()
5f6a1245 212 d.update({'title': title + '-Zip', 'url': zip_})
df537474 213 result.append(d)
214
215 if len(formats) > 0:
216 d = common.copy()
5f6a1245 217 d.update({'title': title, 'formats': formats})
df537474 218 result.append(d)
219
220 return result
221
222 def _extract_entry_item(self, html, content_path):
223 contents = self._extract_content(html, content_path)
224 if contents is None:
225 return contents
226
b30c4992
JMF
227 if len(contents) > 1:
228 raise ExtractorError('Got more than one entry')
229 result = contents[0]
230 result['authors'] = self._extract_authors(html)
df537474 231
b30c4992 232 return result
df537474 233
234 def _extract_session(self, html, content_path):
235 contents = self._extract_content(html, content_path)
236 if contents is None:
237 return contents
238
025f30ba
PH
239 session_meta = {
240 'session_code': self._extract_session_code(html),
241 'session_day': self._extract_session_day(html),
242 'session_room': self._extract_session_room(html),
243 'session_speakers': self._extract_session_speakers(html),
244 }
df537474 245
246 for content in contents:
247 content.update(session_meta)
248
025f30ba 249 return self.playlist_result(contents)
df537474 250
df537474 251 def _extract_list(self, content_path):
adc267ee 252 rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
4d2ebb6b 253 entries = [self.url_result(session_url.text, 'Channel9')
254 for session_url in rss.findall('./channel/item/link')]
255 title_text = rss.find('./channel/title').text
256 return self.playlist_result(entries, content_path, title_text)
df537474 257
258 def _real_extract(self, url):
259 mobj = re.match(self._VALID_URL, url)
260 content_path = mobj.group('contentpath')
261
adc267ee 262 webpage = self._download_webpage(url, content_path, 'Downloading web page')
df537474 263
a316a83d 264 page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
265 if page_type_m is not None:
266 page_type = page_type_m.group('pagetype')
267 if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content
268 return self._extract_entry_item(webpage, content_path)
269 elif page_type == 'Session': # Event session page, may contain downloadable content
270 return self._extract_session(webpage, content_path)
271 elif page_type == 'Event':
272 return self._extract_list(content_path)
273 else:
274 raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
275
5f6a1245 276 else: # Assuming list
df537474 277 return self._extract_list(content_path)