[yt-dlp.git] / youtube_dl / extractor / channel9.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError

class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of the provided URL (video or playlist) is determined from the
    Search.PageType meta tag in the page HTML rather than from the URL itself,
    because the URL alone is not always enough to tell the two apart.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'file': 'Events_TechEd_Australia_2013_KOS002.mp4',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': [ 'Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen' ],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'file': 'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': [ 'Mike Wilmot' ],
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''
            (?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        '''
        Return the page title.

        The "title" meta tag is preferred.  When it yields None, the Open
        Graph title is used instead, with a trailing " (Channel 9)" removed
        if present.
        '''
        meta_title = self._html_search_meta('title', html, 'title')
        if meta_title is not None:
            return meta_title

        og_title = self._og_search_title(html)
        suffix = ' (Channel 9)'
        if og_title is None or not og_title.endswith(suffix):
            return og_title
        return og_title[:-len(suffix)]

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        '''
        Build the info dicts for everything downloadable on an item page.

        Up to three entries are produced, in this order: the slides, the zip
        archive and the recording itself.  All of them share the same
        metadata and use content_path as their id.  When the page offers none
        of the three, a warning is reported and None is returned.
        '''
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return None

        # Metadata shared by every entry produced from this page
        title = self._extract_title(html)
        common = {
            '_type': 'video',
            'id': content_path,
            'description': self._extract_description(html),
            'thumbnail': self._og_search_thumbnail(html),
            'duration': self._extract_duration(html),
            'avg_rating': self._extract_avg_rating(html),
            'rating_count': self._extract_rating_count(html),
            'view_count': self._extract_view_count(html),
            'comment_count': self._extract_comment_count(html),
        }

        # Per-entry fields, collected in output order
        specifics = []
        if slides is not None:
            specifics.append({'title': title + '-Slides', 'url': slides})
        if zip_ is not None:
            specifics.append({'title': title + '-Zip', 'url': zip_})
        if formats:
            specifics.append({'title': title, 'formats': formats})

        result = []
        for extra in specifics:
            entry = common.copy()
            entry.update(extra)
            result.append(entry)
        return result

    def _extract_entry_item(self, html, content_path):
        '''
        Extract an "Entry.Item" page: the entries from _extract_content(),
        each tagged with the page's authors.  Returns None when the page has
        nothing to download.
        '''
        contents = self._extract_content(html, content_path)
        if contents is not None:
            authors = self._extract_authors(html)
            for entry in contents:
                entry['authors'] = authors
        return contents

    def _extract_session(self, html, content_path):
        '''
        Extract a "Session" page: the entries from _extract_content(), each
        extended with the session code, day, room and speakers.  Returns None
        when the page has nothing to download.
        '''
        contents = self._extract_content(html, content_path)
        if contents is not None:
            session_meta = {
                'session_code': self._extract_session_code(html),
                'session_day': self._extract_session_day(html),
                'session_room': self._extract_session_room(html),
                'session_speakers': self._extract_session_speakers(html),
            }
            for entry in contents:
                entry.update(session_meta)
        return contents

    def _extract_list(self, content_path):
        '''
        Extract a "List" page as a playlist.

        The RSS feed for content_path is downloaded; every item link becomes
        a url_result handled by the Channel9 extractor, and the channel title
        becomes the playlist title.
        '''
        feed_url = self._RSS_URL % content_path
        rss = self._download_xml(feed_url, content_path, 'Downloading RSS')
        entries = []
        for link in rss.findall('./channel/item/link'):
            entries.append(self.url_result(link.text, 'Channel9'))
        playlist_title = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, playlist_title)

    def _real_extract(self, url):
        '''
        Download the page at url and dispatch on its Search.PageType meta tag.

        "List" pages are extracted as a playlist; "Entry.Item" and "Session"
        pages are extracted as item pages and yield a list of info dicts, or
        None when nothing is downloadable.

        Raises ExtractorError (marked as expected) when the meta tag is
        missing or carries any other value.
        '''
        mobj = re.match(self._VALID_URL, url)
        # The greedy contentpath group in _VALID_URL swallows a trailing slash
        # (the optional "/?" after it never gets to match), so drop it here.
        # Otherwise it leaks into the id and yields ".../path//RSS" for lists.
        content_path = mobj.group('contentpath').rstrip('/')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')
        if page_type == 'List':
            # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        if page_type == 'Entry.Item':
            # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        if page_type == 'Session':
            # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)
Commit	Line	Data
adc267ee	1	from __future__ import unicode_literals
df537474	2
	3	import re
	4
	5	from .common import InfoExtractor
4d2ebb6b	6	from ..utils import ExtractorError
df537474	7
	8	class Channel9IE(InfoExtractor):
	9	'''
	10	Common extractor for channel9.msdn.com.
	11
	12	The type of provided URL (video or playlist) is determined according to
	13	meta Search.PageType from web page HTML rather than URL itself, as it is
adc267ee	14	not always possible to do.
df537474	15	'''
adc267ee	16	IE_DESC = 'Channel 9'
adc267ee	17	IE_NAME = 'channel9'
df537474	18	_VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
	19
	20	_TESTS = [
	21	{
adc267ee	22	'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
	23	'file': 'Events_TechEd_Australia_2013_KOS002.mp4',
	24	'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
	25	'info_dict': {
	26	'title': 'Developer Kick-Off Session: Stuff We Love',
	27	'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
	28	'duration': 4576,
	29	'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
	30	'session_code': 'KOS002',
	31	'session_day': 'Day 1',
	32	'session_room': 'Arena 1A',
	33	'session_speakers': [ 'Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen' ],
df537474	34	},
	35	},
	36	{
adc267ee	37	'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
	38	'file': 'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
	39	'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
	40	'info_dict': {
	41	'title': 'Self-service BI with Power BI - nuclear testing',
	42	'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
	43	'duration': 1540,
	44	'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
	45	'authors': [ 'Mike Wilmot' ],
df537474	46	},
	47	}
	48	]
	49
	50	_RSS_URL = 'http://channel9.msdn.com/%s/RSS'
df537474	51
	52	# Sorted by quality
	53	_known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
	54
	55	def _restore_bytes(self, formatted_size):
	56	if not formatted_size:
	57	return 0
	58	m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
	59	if not m:
	60	return 0
	61	units = m.group('units')
	62	try:
adc267ee	63	exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
df537474	64	except ValueError:
	65	return 0
	66	size = float(m.group('size'))
	67	return int(size * (1024 ** exponent))
	68
	69	def _formats_from_html(self, html):
	70	FORMAT_REGEX = r'''
	71	(?x)
	72	<a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
	73	<span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
	74	(?:<div\s+class="popup\s+rounded">\s*
	75	<h3>File\s+size</h3>\s(?P<filesize>.?)\s*
	76	</div>)? # File size part may be missing
	77	'''
	78	# Extract known formats
a1b92edb PH	79	formats = [{
	80	'url': x.group('url'),
	81	'format_id': x.group('quality'),
	82	'format_note': x.group('note'),
adc267ee	83	'format': '%s (%s)' % (x.group('quality'), x.group('note')),
a1b92edb PH	84	'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
	85	'preference': self._known_formats.index(x.group('quality')),
	86	'vcodec': 'none' if x.group('note') == 'Audio only' else None,
	87	} for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
	88
	89	self._sort_formats(formats)
	90
df537474	91	return formats
df537474	92
df537474	93	def _extract_title(self, html):
adc267ee	94	title = self._html_search_meta('title', html, 'title')
df537474	95	if title is None:
df537474	96	title = self._og_search_title(html)
adc267ee	97	TITLE_SUFFIX = ' (Channel 9)'
df537474	98	if title is not None and title.endswith(TITLE_SUFFIX):
	99	title = title[:-len(TITLE_SUFFIX)]
	100	return title
	101
	102	def _extract_description(self, html):
	103	DESCRIPTION_REGEX = r'''(?sx)
	104	<div\s+class="entry-content">\s*
	105	<div\s+id="entry-body">\s*
	106	(?P<description>.+?)\s*
	107	</div>\s*
	108	</div>
	109	'''
	110	m = re.search(DESCRIPTION_REGEX, html)
	111	if m is not None:
	112	return m.group('description')
adc267ee	113	return self._html_search_meta('description', html, 'description')
df537474	114
	115	def _extract_duration(self, html):
	116	m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
	117	return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
	118
	119	def _extract_slides(self, html):
	120	m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
	121	return m.group('slidesurl') if m is not None else None
	122
	123	def _extract_zip(self, html):
	124	m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
	125	return m.group('zipurl') if m is not None else None
	126
	127	def _extract_avg_rating(self, html):
	128	m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
	129	return float(m.group('avgrating')) if m is not None else 0
	130
	131	def _extract_rating_count(self, html):
	132	m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
	133	return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
	134
	135	def _extract_view_count(self, html):
	136	m = re.search(r'<li class="views">\s<span class="count">(?P<viewcount>[^<]+)</span> Views\s</li>', html)
	137	return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
	138
	139	def _extract_comment_count(self, html):
	140	m = re.search(r'<li class="comments">\s<a href="#comments">\s<span class="count">(?P<commentcount>[^<]+)</span> Comments\s</a>\s</li>', html)
	141	return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
	142
	143	def _fix_count(self, count):
	144	return int(str(count).replace(',', '')) if count is not None else None
	145
	146	def _extract_authors(self, html):
	147	m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
	148	if m is None:
	149	return None
	150	return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
	151
	152	def _extract_session_code(self, html):
	153	m = re.search(r'<li class="code">\s(?P<code>.+?)\s</li>', html)
	154	return m.group('code') if m is not None else None
	155
	156	def _extract_session_day(self, html):
	157	m = re.search(r'<li class="day">\s<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s</li>', html)
	158	return m.group('day') if m is not None else None
	159
	160	def _extract_session_room(self, html):
	161	m = re.search(r'<li class="room">\s(?P<room>.+?)\s</li>', html)
	162	return m.group('room') if m is not None else None
	163
	164	def _extract_session_speakers(self, html):
	165	return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
	166
	167	def _extract_content(self, html, content_path):
	168	# Look for downloadable content
	169	formats = self._formats_from_html(html)
	170	slides = self._extract_slides(html)
	171	zip_ = self._extract_zip(html)
	172
	173	# Nothing to download
	174	if len(formats) == 0 and slides is None and zip_ is None:
adc267ee	175	self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
df537474	176	return
	177
	178	# Extract meta
	179	title = self._extract_title(html)
	180	description = self._extract_description(html)
	181	thumbnail = self._og_search_thumbnail(html)
	182	duration = self._extract_duration(html)
	183	avg_rating = self._extract_avg_rating(html)
	184	rating_count = self._extract_rating_count(html)
	185	view_count = self._extract_view_count(html)
	186	comment_count = self._extract_comment_count(html)
	187
	188	common = {'_type': 'video',
	189	'id': content_path,
	190	'description': description,
	191	'thumbnail': thumbnail,
	192	'duration': duration,
	193	'avg_rating': avg_rating,
	194	'rating_count': rating_count,
	195	'view_count': view_count,
	196	'comment_count': comment_count,
	197	}
	198
	199	result = []
	200
	201	if slides is not None:
	202	d = common.copy()
	203	d.update({ 'title': title + '-Slides', 'url': slides })
	204	result.append(d)
	205
	206	if zip_ is not None:
	207	d = common.copy()
	208	d.update({ 'title': title + '-Zip', 'url': zip_ })
	209	result.append(d)
	210
	211	if len(formats) > 0:
	212	d = common.copy()
	213	d.update({ 'title': title, 'formats': formats })
	214	result.append(d)
	215
	216	return result
	217
	218	def _extract_entry_item(self, html, content_path):
	219	contents = self._extract_content(html, content_path)
	220	if contents is None:
	221	return contents
	222
	223	authors = self._extract_authors(html)
	224
	225	for content in contents:
	226	content['authors'] = authors
	227
	228	return contents
	229
	230	def _extract_session(self, html, content_path):
	231	contents = self._extract_content(html, content_path)
	232	if contents is None:
	233	return contents
	234
	235	session_meta = {'session_code': self._extract_session_code(html),
	236	'session_day': self._extract_session_day(html),
	237	'session_room': self._extract_session_room(html),
	238	'session_speakers': self._extract_session_speakers(html),
	239	}
240
241	for content in contents:
242	content.update(session_meta)
243
244	return contents
245
df537474	246	def _extract_list(self, content_path):
adc267ee	247	rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
4d2ebb6b	248	entries = [self.url_result(session_url.text, 'Channel9')
	249	for session_url in rss.findall('./channel/item/link')]
	250	title_text = rss.find('./channel/title').text
	251	return self.playlist_result(entries, content_path, title_text)
df537474	252
	253	def _real_extract(self, url):
	254	mobj = re.match(self._VALID_URL, url)
	255	content_path = mobj.group('contentpath')
	256
adc267ee	257	webpage = self._download_webpage(url, content_path, 'Downloading web page')
df537474	258
	259	page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
	260	if page_type_m is None:
adc267ee	261	raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)
df537474	262
	263	page_type = page_type_m.group('pagetype')
	264	if page_type == 'List': # List page, may contain list of 'item'-like objects
	265	return self._extract_list(content_path)
	266	elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
	267	return self._extract_entry_item(webpage, content_path)
	268	elif page_type == 'Session': # Event session page, may contain downloadable content
	269	return self._extract_session(webpage, content_path)
	270	else:
adc267ee	271	raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)