# encoding: utf-8

import re

from .common import InfoExtractor
from ..utils import ExtractorError

class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined from the
    Search.PageType meta tag in the page HTML rather than from the URL itself,
    since the URL alone does not always reveal it.
    '''
    IE_DESC = u'Channel 9'
    IE_NAME = u'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                u'duration': 4576,
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                u'duration': 1540,
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [ u'Mike Wilmot' ],
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

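    # Convert a human-readable size such as '8.2 MB' back into an approximate
    # byte count (returns 0 if the value cannot be parsed)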
    def _restore_bytes(self, formatted_size):
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?  # File size part may be missing
        '''
        # Extract known formats
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': u'%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        title = self._html_search_meta(u'title', html, u'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = u' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta(u'description', html, u'description')

    def _extract_duration(self, html):
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

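    # Strip thousands separators from counts like '1,234' before converting to int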
    def _fix_count(self, count):
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

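    # Build the info dicts shared by entry item and session pages: a separate
    # entry for the slides, for the zip archive, and for the recording itself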
    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {'_type': 'video',
                  'id': content_path,
                  'description': description,
                  'thumbnail': thumbnail,
                  'duration': duration,
                  'avg_rating': avg_rating,
                  'rating_count': rating_count,
                  'view_count': view_count,
                  'comment_count': comment_count,
                  }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({ 'title': title + '-Slides', 'url': slides })
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({ 'title': title + '-Zip', 'url': zip_ })
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({ 'title': title, 'formats': formats })
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {'session_code': self._extract_session_code(html),
                        'session_day': self._extract_session_day(html),
                        'session_room': self._extract_session_room(html),
                        'session_speakers': self._extract_session_speakers(html),
                        }

        for content in contents:
            content.update(session_meta)

        return contents

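    # List pages are handled through their RSS feed: every item link becomes a
    # separate Channel9 URL result inside a playlist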
    def _extract_list(self, content_path):
        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, u'Downloading web page')

        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')
        if page_type == 'List':  # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item':  # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':  # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)