[yt-dlp.git] / youtube_dl / extractor / stitcher.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    int_or_none,
    js_to_json,
    unescapeHTML,
)


class StitcherIE(InfoExtractor):
    """Extractor for single podcast episode pages on stitcher.com.

    Episode URLs end in a numeric id, optionally preceded by a
    human-readable slug (``.../e/<slug>-<id>`` or ``.../e/<id>``).
    """

    _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
    _TESTS = [{
        'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
        'md5': '391dd4e021e6edeb7b8e68fbf2e9e940',
        'info_dict': {
            'id': '40789481',
            'ext': 'mp3',
            'title': 'Machine Learning Mastery and Cancer Clusters',
            'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3',
            'duration': 1604,
            'thumbnail': r're:^https?://.*\.jpg',
        },
    }, {
        'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
        'info_dict': {
            'id': '40846275',
            'display_id': 'the-rare-hourlong-comedy-plus',
            'ext': 'mp3',
            'title': "The CW's 'Crazy Ex-Girlfriend'",
            'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17',
            'duration': 2235,
            'thumbnail': r're:^https?://.*\.jpg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # escaped title
        'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
        'only_matching': True,
    }, {
        'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the episode page at *url*.

        The page is downloaded and the JavaScript object assigned to
        ``stitcher`` / ``stitcherConfig`` is parsed; its
        ``config.episode`` entry supplies the title, audio URL, duration
        and thumbnail. The description is scraped from the page markup.

        Returns a dict with the keys ``id``, ``display_id``, ``title``,
        ``description``, ``duration``, ``thumbnail`` and ``formats``.
        """
        audio_id, slug = re.match(self._VALID_URL, url).group('id', 'display_id')
        # URLs without a slug fall back to the numeric id for display.
        display_id = slug or audio_id

        webpage = self._download_webpage(url, display_id)

        # The config is a JS object literal, so it goes through js_to_json
        # before being parsed.
        config_js = self._search_regex(
            r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')
        page_config = self._parse_json(js_to_json(config_js), display_id)
        episode = page_config['config']['episode']

        title = unescapeHTML(episode['title'])

        # A single audio-only format, present only when the episode
        # carries a non-empty 'episodeURL'.
        formats = []
        media_url = episode.get('episodeURL')
        if media_url:
            formats.append({
                'url': media_url,
                'ext': determine_ext(media_url) or 'mp3',
                'vcodec': 'none',
            })

        # The description is optional: the lookup is non-fatal.
        description = self._search_regex(
            r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False)

        return {
            'id': audio_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'duration': int_or_none(episode.get('duration')),
            'thumbnail': episode.get('episodeImage'),
            'formats': formats,
        }
Commit	Line	Data
4211c83a	1	from __future__ import unicode_literals
7308b8cb S	2
	3	import re
	4
4211c83a	5	from .common import InfoExtractor
7308b8cb S	6	from ..utils import (
	7	determine_ext,
	8	int_or_none,
	9	js_to_json,
	10	unescapeHTML,
	11	)
4211c83a	12
	13
	14	class StitcherIE(InfoExtractor):
7308b8cb S	15	_VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
7308b8cb S	16	_TESTS = [{
4211c83a	17	'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
	18	'md5': '391dd4e021e6edeb7b8e68fbf2e9e940',
	19	'info_dict': {
	20	'id': '40789481',
	21	'ext': 'mp3',
7308b8cb S	22	'title': 'Machine Learning Mastery and Cancer Clusters',
	23	'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3',
	24	'duration': 1604,
ec85ded8	25	'thumbnail': r're:^https?://.*\.jpg',
7308b8cb S	26	},
	27	}, {
	28	'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
	29	'info_dict': {
	30	'id': '40846275',
	31	'display_id': 'the-rare-hourlong-comedy-plus',
	32	'ext': 'mp3',
	33	'title': "The CW's 'Crazy Ex-Girlfriend'",
	34	'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17',
	35	'duration': 2235,
ec85ded8	36	'thumbnail': r're:^https?://.*\.jpg',
7308b8cb S	37	},
	38	'params': {
	39	'skip_download': True,
	40	},
	41	}, {
	42	# escaped title
	43	'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
	44	'only_matching': True,
	45	}, {
	46	'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
	47	'only_matching': True,
	48	}]
4211c83a	49
4211c83a	50	def _real_extract(self, url):
7308b8cb S	51	mobj = re.match(self._VALID_URL, url)
	52	audio_id = mobj.group('id')
	53	display_id = mobj.group('display_id') or audio_id
4211c83a	54
7308b8cb	55	webpage = self._download_webpage(url, display_id)
4211c83a	56
7308b8cb S	57	episode = self._parse_json(
7308b8cb S	58	js_to_json(self._search_regex(
5abf513c	59	r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')),
7308b8cb S	60	display_id)['config']['episode']
	61
	62	title = unescapeHTML(episode['title'])
	63	formats = [{
	64	'url': episode[episode_key],
	65	'ext': determine_ext(episode[episode_key]) or 'mp3',
	66	'vcodec': 'none',
7b3a19e5	67	} for episode_key in ('episodeURL',) if episode.get(episode_key)]
7308b8cb S	68	description = self._search_regex(
	69	r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False)
	70	duration = int_or_none(episode.get('duration'))
	71	thumbnail = episode.get('episodeImage')
4211c83a	72
	73	return {
	74	'id': audio_id,
7308b8cb	75	'display_id': display_id,
4211c83a	76	'title': title,
7308b8cb	77	'description': description,
4211c83a	78	'duration': duration,
7308b8cb S	79	'thumbnail': thumbnail,
7308b8cb S	80	'formats': formats,
4211c83a	81	}