[yt-dlp.git] / youtube_dl / extractor / channel9.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError

class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of the provided URL (video or playlist) is determined from the
    Search.PageType meta tag found in the web page HTML rather than from the
    URL itself, because the type cannot always be told from the URL alone.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    # Everything after the host name is captured as `contentpath`.
    # NOTE(review): `.+` is greedy, so the trailing `/?` never keeps a final
    # slash out of the captured path - confirm whether that is intended.
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    # Sample URLs together with the metadata expected for each of them
    # (presumably consumed by the project's extractor test harness - confirm).
    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': [ 'Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen' ],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': [ 'Mike Wilmot' ],
            },
        }
    ]

    # RSS feed URL template; %s is filled with the content path of a list page.
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Quality labels recognised on download links, sorted by quality (listed
    # from lowest to highest); the index of a label in this list is used as
    # the 'preference' value of the corresponding format.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''
            (?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        title = self._html_search_meta('title', html, 'title')
        if title is None:           
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        # Look for downloadable content        
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {'_type': 'video',
                  'id': content_path,
                  'description': description,
                  'thumbnail': thumbnail,
                  'duration': duration,
                  'avg_rating': avg_rating,
                  'rating_count': rating_count,
                  'view_count': view_count,
                  'comment_count': comment_count,
                }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({ 'title': title + '-Slides', 'url': slides })
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({ 'title': title + '-Zip', 'url': zip_ })
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({ 'title': title, 'formats': formats })
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {'session_code': self._extract_session_code(html),
                        'session_day': self._extract_session_day(html),
                        'session_room': self._extract_session_room(html),
                        'session_speakers': self._extract_session_speakers(html),
                        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_list(self, content_path):
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')
        if page_type == 'List':         # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':    # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)
Commit	Line	Data
adc267ee	1	from __future__ import unicode_literals
df537474	2
	3	import re
	4
	5	from .common import InfoExtractor
4d2ebb6b	6	from ..utils import ExtractorError
df537474	7
	8	class Channel9IE(InfoExtractor):
	9	'''
	10	Common extractor for channel9.msdn.com.
	11
	12	The type of provided URL (video or playlist) is determined according to
	13	meta Search.PageType from web page HTML rather than URL itself, as it is
adc267ee	14	not always possible to do.
df537474	15	'''
adc267ee	16	IE_DESC = 'Channel 9'
adc267ee	17	IE_NAME = 'channel9'
79bc27b5	18	_VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
df537474	19
	20	_TESTS = [
	21	{
adc267ee	22	'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
adc267ee	23	'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
adc267ee	24	'info_dict': {
79bc27b5 S	25	'id': 'Events/TechEd/Australia/2013/KOS002',
79bc27b5 S	26	'ext': 'mp4',
adc267ee	27	'title': 'Developer Kick-Off Session: Stuff We Love',
	28	'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
	29	'duration': 4576,
	30	'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
	31	'session_code': 'KOS002',
	32	'session_day': 'Day 1',
	33	'session_room': 'Arena 1A',
	34	'session_speakers': [ 'Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen' ],
df537474	35	},
	36	},
	37	{
adc267ee	38	'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
adc267ee	39	'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
adc267ee	40	'info_dict': {
79bc27b5 S	41	'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
79bc27b5 S	42	'ext': 'mp4',
adc267ee	43	'title': 'Self-service BI with Power BI - nuclear testing',
	44	'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
	45	'duration': 1540,
	46	'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
	47	'authors': [ 'Mike Wilmot' ],
df537474	48	},
	49	}
	50	]
	51
	52	_RSS_URL = 'http://channel9.msdn.com/%s/RSS'
df537474	53
	54	# Sorted by quality
	55	_known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
	56
	57	def _restore_bytes(self, formatted_size):
	58	if not formatted_size:
	59	return 0
	60	m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
	61	if not m:
	62	return 0
	63	units = m.group('units')
	64	try:
adc267ee	65	exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
df537474	66	except ValueError:
	67	return 0
	68	size = float(m.group('size'))
	69	return int(size * (1024 ** exponent))
	70
	71	def _formats_from_html(self, html):
	72	FORMAT_REGEX = r'''
	73	(?x)
	74	<a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
	75	<span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
	76	(?:<div\s+class="popup\s+rounded">\s*
	77	<h3>File\s+size</h3>\s(?P<filesize>.?)\s*
	78	</div>)? # File size part may be missing
	79	'''
	80	# Extract known formats
a1b92edb PH	81	formats = [{
	82	'url': x.group('url'),
	83	'format_id': x.group('quality'),
	84	'format_note': x.group('note'),
adc267ee	85	'format': '%s (%s)' % (x.group('quality'), x.group('note')),
a1b92edb PH	86	'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
	87	'preference': self._known_formats.index(x.group('quality')),
	88	'vcodec': 'none' if x.group('note') == 'Audio only' else None,
	89	} for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
	90
	91	self._sort_formats(formats)
	92
df537474	93	return formats
df537474	94
df537474	95	def _extract_title(self, html):
adc267ee	96	title = self._html_search_meta('title', html, 'title')
df537474	97	if title is None:
df537474	98	title = self._og_search_title(html)
adc267ee	99	TITLE_SUFFIX = ' (Channel 9)'
df537474	100	if title is not None and title.endswith(TITLE_SUFFIX):
	101	title = title[:-len(TITLE_SUFFIX)]
	102	return title
	103
	104	def _extract_description(self, html):
	105	DESCRIPTION_REGEX = r'''(?sx)
	106	<div\s+class="entry-content">\s*
	107	<div\s+id="entry-body">\s*
	108	(?P<description>.+?)\s*
	109	</div>\s*
	110	</div>
	111	'''
	112	m = re.search(DESCRIPTION_REGEX, html)
	113	if m is not None:
	114	return m.group('description')
adc267ee	115	return self._html_search_meta('description', html, 'description')
df537474	116
	117	def _extract_duration(self, html):
	118	m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
	119	return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
	120
	121	def _extract_slides(self, html):
	122	m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
	123	return m.group('slidesurl') if m is not None else None
	124
	125	def _extract_zip(self, html):
	126	m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
	127	return m.group('zipurl') if m is not None else None
	128
	129	def _extract_avg_rating(self, html):
	130	m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
	131	return float(m.group('avgrating')) if m is not None else 0
	132
	133	def _extract_rating_count(self, html):
	134	m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
	135	return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
	136
	137	def _extract_view_count(self, html):
	138	m = re.search(r'<li class="views">\s<span class="count">(?P<viewcount>[^<]+)</span> Views\s</li>', html)
	139	return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
	140
	141	def _extract_comment_count(self, html):
	142	m = re.search(r'<li class="comments">\s<a href="#comments">\s<span class="count">(?P<commentcount>[^<]+)</span> Comments\s</a>\s</li>', html)
	143	return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
	144
	145	def _fix_count(self, count):
	146	return int(str(count).replace(',', '')) if count is not None else None
	147
	148	def _extract_authors(self, html):
	149	m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
	150	if m is None:
	151	return None
	152	return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
	153
	154	def _extract_session_code(self, html):
	155	m = re.search(r'<li class="code">\s(?P<code>.+?)\s</li>', html)
	156	return m.group('code') if m is not None else None
	157
	158	def _extract_session_day(self, html):
	159	m = re.search(r'<li class="day">\s<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s</li>', html)
	160	return m.group('day') if m is not None else None
	161
	162	def _extract_session_room(self, html):
	163	m = re.search(r'<li class="room">\s(?P<room>.+?)\s</li>', html)
	164	return m.group('room') if m is not None else None
	165
	166	def _extract_session_speakers(self, html):
	167	return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
	168
	169	def _extract_content(self, html, content_path):
	170	# Look for downloadable content
	171	formats = self._formats_from_html(html)
	172	slides = self._extract_slides(html)
	173	zip_ = self._extract_zip(html)
	174
	175	# Nothing to download
	176	if len(formats) == 0 and slides is None and zip_ is None:
adc267ee	177	self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
df537474	178	return
	179
	180	# Extract meta
	181	title = self._extract_title(html)
	182	description = self._extract_description(html)
	183	thumbnail = self._og_search_thumbnail(html)
	184	duration = self._extract_duration(html)
	185	avg_rating = self._extract_avg_rating(html)
	186	rating_count = self._extract_rating_count(html)
	187	view_count = self._extract_view_count(html)
	188	comment_count = self._extract_comment_count(html)
	189
	190	common = {'_type': 'video',
	191	'id': content_path,
	192	'description': description,
	193	'thumbnail': thumbnail,
	194	'duration': duration,
	195	'avg_rating': avg_rating,
	196	'rating_count': rating_count,
	197	'view_count': view_count,
	198	'comment_count': comment_count,
	199	}
	200
	201	result = []
	202
	203	if slides is not None:
	204	d = common.copy()
	205	d.update({ 'title': title + '-Slides', 'url': slides })
	206	result.append(d)
	207
	208	if zip_ is not None:
	209	d = common.copy()
	210	d.update({ 'title': title + '-Zip', 'url': zip_ })
	211	result.append(d)
	212
	213	if len(formats) > 0:
	214	d = common.copy()
	215	d.update({ 'title': title, 'formats': formats })
	216	result.append(d)
	217
	218	return result
	219
	220	def _extract_entry_item(self, html, content_path):
	221	contents = self._extract_content(html, content_path)
	222	if contents is None:
	223	return contents
	224
	225	authors = self._extract_authors(html)
	226
	227	for content in contents:
	228	content['authors'] = authors
	229
	230	return contents
	231
	232	def _extract_session(self, html, content_path):
	233	contents = self._extract_content(html, content_path)
	234	if contents is None:
	235	return contents
	236
	237	session_meta = {'session_code': self._extract_session_code(html),
	238	'session_day': self._extract_session_day(html),
	239	'session_room': self._extract_session_room(html),
	240	'session_speakers': self._extract_session_speakers(html),
	241	}
242
243	for content in contents:
244	content.update(session_meta)
245
246	return contents
247
df537474	248	def _extract_list(self, content_path):
adc267ee	249	rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
4d2ebb6b	250	entries = [self.url_result(session_url.text, 'Channel9')
	251	for session_url in rss.findall('./channel/item/link')]
	252	title_text = rss.find('./channel/title').text
	253	return self.playlist_result(entries, content_path, title_text)
df537474	254
	255	def _real_extract(self, url):
	256	mobj = re.match(self._VALID_URL, url)
	257	content_path = mobj.group('contentpath')
	258
adc267ee	259	webpage = self._download_webpage(url, content_path, 'Downloading web page')
df537474	260
	261	page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
	262	if page_type_m is None:
adc267ee	263	raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)
df537474	264
	265	page_type = page_type_m.group('pagetype')
	266	if page_type == 'List': # List page, may contain list of 'item'-like objects
	267	return self._extract_list(content_path)
	268	elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
	269	return self._extract_entry_item(webpage, content_path)
	270	elif page_type == 'Session': # Event session page, may contain downloadable content
	271	return self._extract_session(webpage, content_path)
	272	else:
adc267ee	273	raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)