[yt-dlp.git] / yt_dlp / extractor / webofstories.py

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    orderedSet,
)


class WebOfStoriesIE(InfoExtractor):
    """Extractor for single story pages under webofstories.com/play/."""

    _VALID_URL = r'https?://(?:www\.)?webofstories\.com/play/(?:[^/]+/)?(?P<id>[0-9]+)'
    # Base URL the direct MP4 links are built on.
    _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
    # RTMP streamer used when the embed parameters mark the story as "great life".
    _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
    # RTMP streamer used for every other story.
    _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
    _TESTS = [{
        'url': 'http://www.webofstories.com/play/hans.bethe/71',
        'md5': '373e4dd915f60cfe3116322642ddf364',
        'info_dict': {
            'id': '4536',
            'ext': 'mp4',
            'title': 'The temperature of the sun',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'Hans Bethe talks about calculating the temperature of the sun',
            'duration': 238,
        },
    }, {
        'url': 'http://www.webofstories.com/play/55908',
        'md5': '2985a698e1fe3211022422c4b5ed962c',
        'info_dict': {
            'id': '55908',
            'ext': 'mp4',
            'title': 'The story of Gemmata obscuriglobus',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
            'duration': 169,
        },
        'skip': 'notfound',
    }, {
        # malformed og:title meta
        'url': 'http://www.webofstories.com/play/54215?o=MS',
        'info_dict': {
            'id': '54215',
            'ext': 'mp4',
            'title': '"A Leg to Stand On"',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'Oliver Sacks talks about the death and resurrection of a limb',
            'duration': 97,
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Download a story page and build its info dict.

        The title, description and thumbnail come from the page's meta tags
        (with a markup fallback for the title).  The story id, the duration
        and the pieces of the media URLs come from the arguments of the
        ``getEmbedCode(...)`` call embedded in the page.

        Returns a dict with ``id``, ``title``, ``formats``, ``thumbnail``,
        ``description`` and ``duration``.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        # Sometimes og:title meta is malformed
        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            r'(?s)<strong>Title:\s*</strong>(.+?)<', webpage, 'title')
        description = self._html_search_meta('description', webpage)
        thumbnail = self._og_search_thumbnail(webpage)

        # The pattern only matches a getEmbedCode() call with exactly ten
        # comma-separated arguments (nine "<arg>," groups plus a final one),
        # so a page with a different argument count fails in this lookup
        # instead of with a bare ValueError in the tuple unpacking below.
        # The dot before "html" is escaped so it only matches a literal dot.
        raw_embed_params = self._search_regex(
            r'(?s)\$\("#embedCode"\)\.html\(getEmbedCode\(((?:[^,)]*,){9}[^,)]*)\)',
            webpage, 'embed params')
        # Arguments are single-quoted JS literals; drop the quotes and padding.
        embed_params = [s.strip(" \r\n\t'") for s in raw_embed_params.split(',')]

        (
            _, speaker_id, story_id, story_duration,
            speaker_type, great_life, _thumbnail, _has_subtitles,
            story_filename, _story_order) = embed_params

        is_great_life_series = great_life == 'true'
        duration = int_or_none(story_duration)

        # URL building, see: http://www.webofstories.com/scripts/player.js
        ms_prefix = 'mini_sites/' if speaker_type.lower() == 'ms' else ''

        if is_great_life_series:
            mp4_url = f'{self._VIDEO_DOMAIN}lives/{speaker_id}/{story_filename}.mp4'
            rtmp_ext = 'flv'
            streamer = self._GREAT_LIFE_STREAMER
            play_path = f'stories/{speaker_id}/{story_filename}'
        else:
            mp4_url = f'{self._VIDEO_DOMAIN}{ms_prefix}{speaker_id}/{story_filename}.mp4'
            rtmp_ext = 'mp4'
            streamer = self._USER_STREAMER
            play_path = f'mp4:{ms_prefix}{speaker_id}/{story_filename}.mp4'

        # One direct MP4 format and one RTMP format for the same story.
        formats = [{
            'format_id': 'mp4_sd',
            'url': mp4_url,
        }, {
            'format_id': 'rtmp_sd',
            'page_url': url,
            'url': streamer,
            'ext': rtmp_ext,
            'play_path': play_path,
        }]

        return {
            # The id reported is the one from the embed parameters, which can
            # differ from the number in the URL (see the first test case).
            'id': story_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
            'description': description,
            'duration': duration,
        }


class WebOfStoriesPlaylistIE(InfoExtractor):
    """Extractor for "play all" pages under webofstories.com/playAll/."""

    _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://www.webofstories.com/playAll/donald.knuth',
        'info_dict': {
            'id': 'donald.knuth',
            'title': 'Donald Knuth (Scientist)',
        },
        'playlist_mincount': 97,
    }

    def _real_extract(self, url):
        """Build a playlist from every story id referenced on the page.

        Story ids are read from ``id="td_<digits>"`` attributes, de-duplicated
        with ``orderedSet`` and turned into ``/play/<id>`` URL results.  The
        playlist title is the speaker name, followed by the primary field in
        parentheses when present; if no speaker name is found, the title is
        taken from the page's ``<title>`` element instead.
        """
        playlist_id = self._match_id(url)
        page = self._download_webpage(url, playlist_id)

        story_ids = orderedSet(re.findall(r'\bid=["\']td_(\d+)', page))
        entries = [
            self.url_result(
                f'http://www.webofstories.com/play/{story_id}',
                'WebOfStories', video_id=story_id)
            for story_id in story_ids
        ]

        speaker = self._search_regex(
            r'<div id="speakerName">\s*<span>([^<]+)</span>',
            page, 'speaker', default=None)

        if speaker:
            # The field is optional decoration; the speaker name alone is enough.
            primary_field = self._search_regex(
                r'<span id="primaryField">([^<]+)</span>',
                page, 'field', default=None)
            playlist_title = f'{speaker} ({primary_field})' if primary_field else speaker
        else:
            # No speaker block on the page: fall back to the <title> element.
            playlist_title = self._search_regex(
                r'<title>Play\s+all\s+stories\s*-\s*([^<]+)\s*-\s*Web\s+of\s+Stories</title>',
                page, 'title')

        return self.playlist_result(entries, playlist_id, playlist_title)
Commit	Line	Data
2028c6e0 SZ	1	import re
2028c6e0 SZ	2
caf90bfa	3	from .common import InfoExtractor
dd88fd65 S	4	from ..utils import (
	5	int_or_none,
	6	orderedSet,
	7	)
caf90bfa NJ	8
	9
	10	class WebOfStoriesIE(InfoExtractor):
	11	_VALID_URL = r'https?://(?:www\.)?webofstories\.com/play/(?:[^/]+/)?(?P<id>[0-9]+)'
	12	_VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
	13	_GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
	14	_USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
8870bb46 S	15	_TESTS = [{
	16	'url': 'http://www.webofstories.com/play/hans.bethe/71',
	17	'md5': '373e4dd915f60cfe3116322642ddf364',
	18	'info_dict': {
	19	'id': '4536',
	20	'ext': 'mp4',
	21	'title': 'The temperature of the sun',
ec85ded8	22	'thumbnail': r're:^https?://.*\.jpg$',
8870bb46 S	23	'description': 'Hans Bethe talks about calculating the temperature of the sun',
8870bb46 S	24	'duration': 238,
add96eb9	25	},
8870bb46 S	26	}, {
	27	'url': 'http://www.webofstories.com/play/55908',
	28	'md5': '2985a698e1fe3211022422c4b5ed962c',
	29	'info_dict': {
	30	'id': '55908',
	31	'ext': 'mp4',
	32	'title': 'The story of Gemmata obscuriglobus',
ec85ded8	33	'thumbnail': r're:^https?://.*\.jpg$',
8870bb46 S	34	'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
8870bb46 S	35	'duration': 169,
caf90bfa	36	},
8870bb46 S	37	'skip': 'notfound',
	38	}, {
	39	# malformed og:title meta
	40	'url': 'http://www.webofstories.com/play/54215?o=MS',
	41	'info_dict': {
	42	'id': '54215',
	43	'ext': 'mp4',
	44	'title': '"A Leg to Stand On"',
ec85ded8	45	'thumbnail': r're:^https?://.*\.jpg$',
8870bb46 S	46	'description': 'Oliver Sacks talks about the death and resurrection of a limb',
8870bb46 S	47	'duration': 97,
caf90bfa	48	},
8870bb46 S	49	'params': {
	50	'skip_download': True,
	51	},
	52	}]
caf90bfa NJ	53
	54	def _real_extract(self, url):
	55	video_id = self._match_id(url)
	56
	57	webpage = self._download_webpage(url, video_id)
8870bb46 S	58	# Sometimes og:title meta is malformed
	59	title = self._og_search_title(webpage, default=None) or self._html_search_regex(
	60	r'(?s)<strong>Title:\s*</strong>(.+?)<', webpage, 'title')
caf90bfa NJ	61	description = self._html_search_meta('description', webpage)
	62	thumbnail = self._og_search_thumbnail(webpage)
	63
3d547884 PH	64	embed_params = [s.strip(" \r\n\t'") for s in self._search_regex(
	65	r'(?s)\$\("#embedCode"\).html\(getEmbedCode\((.*?)\)',
	66	webpage, 'embed params').split(',')]
	67
	68	(
	69	_, speaker_id, story_id, story_duration,
	70	speaker_type, great_life, _thumbnail, _has_subtitles,
	71	story_filename, _story_order) = embed_params
	72
caf90bfa	73	is_great_life_series = great_life == 'true'
3d547884	74	duration = int_or_none(story_duration)
caf90bfa NJ	75
	76	# URL building, see: http://www.webofstories.com/scripts/player.js
	77	ms_prefix = ''
	78	if speaker_type.lower() == 'ms':
	79	ms_prefix = 'mini_sites/'
	80
	81	if is_great_life_series:
add96eb9	82	mp4_url = f'{self._VIDEO_DOMAIN}lives/{speaker_id}/{story_filename}.mp4'
caf90bfa NJ	83	rtmp_ext = 'flv'
caf90bfa NJ	84	streamer = self._GREAT_LIFE_STREAMER
add96eb9	85	play_path = f'stories/{speaker_id}/{story_filename}'
caf90bfa	86	else:
add96eb9	87	mp4_url = f'{self._VIDEO_DOMAIN}{ms_prefix}{speaker_id}/{story_filename}.mp4'
caf90bfa NJ	88	rtmp_ext = 'mp4'
caf90bfa NJ	89	streamer = self._USER_STREAMER
add96eb9	90	play_path = f'mp4:{ms_prefix}{speaker_id}/{story_filename}.mp4'
caf90bfa NJ	91
	92	formats = [{
	93	'format_id': 'mp4_sd',
	94	'url': mp4_url,
	95	}, {
	96	'format_id': 'rtmp_sd',
	97	'page_url': url,
	98	'url': streamer,
	99	'ext': rtmp_ext,
	100	'play_path': play_path,
	101	}]
	102
caf90bfa NJ	103	return {
	104	'id': story_id,
	105	'title': title,
	106	'formats': formats,
	107	'thumbnail': thumbnail,
	108	'description': description,
	109	'duration': duration,
	110	}
2028c6e0 SZ	111
	112
	113	class WebOfStoriesPlaylistIE(InfoExtractor):
	114	_VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)'
fac54cb4 S	115	_TEST = {
	116	'url': 'http://www.webofstories.com/playAll/donald.knuth',
	117	'info_dict': {
	118	'id': 'donald.knuth',
	119	'title': 'Donald Knuth (Scientist)',
	120	},
	121	'playlist_mincount': 97,
	122	}
2028c6e0 SZ	123
	124	def _real_extract(self, url):
	125	playlist_id = self._match_id(url)
	126
	127	webpage = self._download_webpage(url, playlist_id)
	128
	129	entries = [
dd88fd65	130	self.url_result(
add96eb9	131	f'http://www.webofstories.com/play/{video_id}',
dd88fd65 S	132	'WebOfStories', video_id=video_id)
dd88fd65 S	133	for video_id in orderedSet(re.findall(r'\bid=["\']td_(\d+)', webpage))
2028c6e0 SZ	134	]
2028c6e0 SZ	135
fac54cb4 S	136	title = self._search_regex(
	137	r'<div id="speakerName">\s*<span>([^<]+)</span>',
	138	webpage, 'speaker', default=None)
	139	if title:
	140	field = self._search_regex(
	141	r'<span id="primaryField">([^<]+)</span>',
	142	webpage, 'field', default=None)
	143	if field:
add96eb9	144	title += f' ({field})'
fac54cb4 S	145
	146	if not title:
	147	title = self._search_regex(
	148	r'<title>Play\s+all\s+stories\s-\s([^<]+)\s-\sWeb\s+of\s+Stories</title>',
	149	webpage, 'title')
	150
	151	return self.playlist_result(entries, playlist_id, title)