[yt-dlp.git] / youtube_dl / extractor / mlb.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    parse_iso8601,
    find_xpath_attr,
)


class MLBIE(InfoExtractor):
    """Information extractor for video pages on m.mlb.com.

    The numeric video ID is taken from the page URL and used to download an
    XML "detail" document, from which the title, description, duration,
    timestamp, thumbnail and the list of available formats are read.
    """

    # Accept both http and https page URLs; the optional ``topic/<slug>/``
    # path segment is skipped and the ID is the digits following ``v``
    # (optionally prefixed with ``n``).
    _VALID_URL = r'https?://m\.mlb\.com/video/(?:topic/[\da-z_-]+/)?v(?P<id>n?\d+)'
    _TESTS = [
        {
            'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby',
            'md5': 'd9c022c10d21f849f49c05ae12a8a7e9',
            'info_dict': {
                'id': '34496663',
                'ext': 'mp4',
                'title': 'Stanton prepares for Derby',
                'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57',
                'duration': 46,
                'timestamp': 1405105800,
                'upload_date': '20140711',
                'thumbnail': r're:^https?://.*\.jpg$',
            },
        },
        {
            'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby',
            'md5': '0e6e73d509321e142409b695eadd541f',
            'info_dict': {
                'id': '34578115',
                'ext': 'mp4',
                'title': 'Cespedes repeats as Derby champ',
                'description': 'md5:08df253ce265d4cf6fb09f581fafad07',
                'duration': 488,
                'timestamp': 1405399936,
                'upload_date': '20140715',
                'thumbnail': r're:^https?://.*\.jpg$',
            },
        },
        {
            'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance',
            'md5': 'b8fd237347b844365d74ea61d4245967',
            'info_dict': {
                'id': '34577915',
                'ext': 'mp4',
                'title': 'Bautista on Home Run Derby',
                'description': 'md5:b80b34031143d0986dddc64a8839f0fb',
                'duration': 52,
                'timestamp': 1405390722,
                'upload_date': '20140715',
                'thumbnail': r're:^https?://.*\.jpg$',
            },
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for the video page at ``url``.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``duration``, ``timestamp``, ``formats`` and ``thumbnail``.
        ``description``, ``duration`` and ``thumbnail`` are ``None`` when the
        corresponding element is absent from the detail XML.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # The detail document lives under a directory path made of the last
        # three characters of the video ID, one path segment per character.
        detail = self._download_xml(
            'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
            % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)

        # The title is required; a missing headline is left to fail loudly.
        title = detail.find('./headline').text

        # Optional fields: a missing element must not abort the extraction.
        # Compare against None explicitly, since an Element without children
        # is falsy.
        description_el = detail.find('./big-blurb')
        description = description_el.text if description_el is not None else None

        duration_el = detail.find('./duration')
        duration = (
            parse_duration(duration_el.text) if duration_el is not None else None)

        # The last five characters of the date attribute are dropped before
        # parsing (presumably a numeric UTC offset such as "-0400" -- TODO
        # confirm against a live detail document).
        timestamp = parse_iso8601(detail.attrib['date'][:-5])

        # Use the thumbnail scenario whose "type" attribute is "45".
        thumbnail_el = find_xpath_attr(
            detail, './thumbnailScenarios/thumbnailScenario', 'type', '45')
        thumbnail = thumbnail_el.text if thumbnail_el is not None else None

        formats = []
        for media_url in detail.findall('./url'):
            playback_scenario = media_url.attrib['playback_scenario']
            fmt = {
                'url': media_url.text,
                'format_id': playback_scenario,
            }
            # Scenario names containing "<digits>K_<digits>X<digits>" carry
            # the bitrate and frame size; other names yield no such metadata.
            m = re.search(
                r'(?P<vbr>\d+)K_(?P<width>\d+)X(?P<height>\d+)',
                playback_scenario)
            if m:
                fmt.update({
                    'vbr': int(m.group('vbr')) * 1000,
                    'width': int(m.group('width')),
                    'height': int(m.group('height')),
                })
            formats.append(fmt)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'thumbnail': thumbnail,
        }
Commit	Line	Data
b1b01841 CC	1	from __future__ import unicode_literals
	2
	3	import re
	4
	5	from .common import InfoExtractor
7bb49d10 S	6	from ..utils import (
	7	parse_duration,
	8	parse_iso8601,
	9	find_xpath_attr,
	10	)
b1b01841 CC	11
b1b01841 CC	12
7bb49d10 S	13	class MLBIE(InfoExtractor):
	14	_VALID_URL = r'http?://m\.mlb\.com/video/(?:topic/[\da-z_-]+/)?v(?P<id>n?\d+)'
	15	_TESTS = [
	16	{
	17	'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby',
	18	'md5': 'd9c022c10d21f849f49c05ae12a8a7e9',
	19	'info_dict': {
	20	'id': '34496663',
	21	'ext': 'mp4',
	22	'title': 'Stanton prepares for Derby',
	23	'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57',
	24	'duration': 46,
	25	'timestamp': 1405105800,
	26	'upload_date': '20140711',
	27	'thumbnail': 're:^https?://.*\.jpg$',
	28	},
b1b01841	29	},
7bb49d10 S	30	{
	31	'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby',
	32	'md5': '0e6e73d509321e142409b695eadd541f',
	33	'info_dict': {
	34	'id': '34578115',
	35	'ext': 'mp4',
	36	'title': 'Cespedes repeats as Derby champ',
	37	'description': 'md5:08df253ce265d4cf6fb09f581fafad07',
	38	'duration': 488,
	39	'timestamp': 1405399936,
	40	'upload_date': '20140715',
	41	'thumbnail': 're:^https?://.*\.jpg$',
	42	},
	43	},
	44	{
	45	'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance',
	46	'md5': 'b8fd237347b844365d74ea61d4245967',
	47	'info_dict': {
	48	'id': '34577915',
	49	'ext': 'mp4',
	50	'title': 'Bautista on Home Run Derby',
	51	'description': 'md5:b80b34031143d0986dddc64a8839f0fb',
	52	'duration': 52,
	53	'timestamp': 1405390722,
	54	'upload_date': '20140715',
	55	'thumbnail': 're:^https?://.*\.jpg$',
	56	},
	57	},
	58	]
b1b01841 CC	59
	60	def _real_extract(self, url):
	61	mobj = re.match(self._VALID_URL, url)
	62	video_id = mobj.group('id')
	63
7bb49d10 S	64	detail = self._download_xml(
	65	'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
	66	% (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)
	67
	68	title = detail.find('./headline').text
	69	description = detail.find('./big-blurb').text
	70	duration = parse_duration(detail.find('./duration').text)
	71	timestamp = parse_iso8601(detail.attrib['date'][:-5])
	72
	73	thumbnail = find_xpath_attr(
	74	detail, './thumbnailScenarios/thumbnailScenario', 'type', '45').text
b1b01841	75
7bb49d10 S	76	formats = []
	77	for media_url in detail.findall('./url'):
	78	playback_scenario = media_url.attrib['playback_scenario']
	79	fmt = {
	80	'url': media_url.text,
	81	'format_id': playback_scenario,
	82	}
	83	m = re.search(r'(?P<vbr>\d+)K_(?P<width>\d+)X(?P<height>\d+)', playback_scenario)
	84	if m:
	85	fmt.update({
	86	'vbr': int(m.group('vbr')) * 1000,
	87	'width': int(m.group('width')),
	88	'height': int(m.group('height')),
	89	})
	90	formats.append(fmt)
172240c0	91
7bb49d10	92	self._sort_formats(formats)
172240c0	93
b1b01841 CC	94	return {
b1b01841 CC	95	'id': video_id,
b1b01841	96	'title': title,
b1b01841	97	'description': description,
7bb49d10 S	98	'duration': duration,
	99	'timestamp': timestamp,
	100	'formats': formats,
b1b01841 CC	101	'thumbnail': thumbnail,
b1b01841 CC	102	}