[yt-dlp.git] / yt_dlp / extractor / closertotruth.py

import re

from .common import InfoExtractor


class CloserToTruthIE(InfoExtractor):
    """Extractor for closertotruth.com pages.

    The page itself only supplies the Kaltura partner id and entry id(s);
    every result is returned as ``url_transparent`` with ``ie_key`` set to
    ``Kaltura`` and a ``kaltura:<partner_id>:<entry_id>`` URL, so the actual
    media extraction is delegated to that extractor.
    """

    # The last non-empty path segment is captured as the (display) id.
    _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
        'info_dict': {
            'id': '0_zof1ktre',
            'display_id': 'solutions-the-mind-body-problem',
            'ext': 'mov',
            'title': 'Solutions to the Mind-Body Problem?',
            'upload_date': '20140221',
            'timestamp': 1392956007,
            'uploader_id': 'CTTXML'
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://closertotruth.com/episodes/how-do-brains-work',
        'info_dict': {
            'id': '0_iuxai6g6',
            'display_id': 'how-do-brains-work',
            'ext': 'mov',
            'title': 'How do Brains Work?',
            'upload_date': '20140221',
            'timestamp': 1392956024,
            'uploader_id': 'CTTXML'
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://closertotruth.com/interviews/1725',
        'info_dict': {
            'id': '1725',
            'title': 'AyaFr-002',
        },
        'playlist_mincount': 2,
    }]

    def _real_extract(self, url):
        """Build a Kaltura-delegating result for a closertotruth.com page.

        If the page has a ``<select id="select-version">`` element whose
        options carry entry ids, a playlist with one ``url_transparent``
        entry per distinct id is returned. Otherwise a single
        ``url_transparent`` result is built from the ``embed-kaltura``
        anchor's ``data-kaltura`` attribute.

        The partner id and (in the single-video case) the entry id are looked
        up with ``_search_regex`` without a default, so a page lacking them is
        handled by that helper's failure path.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        # The partner id is taken from the URL of a <script> include, where
        # it follows either "partner_id/" or "p/".
        partner_id = self._search_regex(
            r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
            webpage, 'kaltura partner_id')

        title = self._html_extract_title(webpage, 'video title')

        select = self._search_regex(
            r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
            webpage, 'select version', default=None)
        if select:
            entry_ids = set()
            entries = []
            # Scan only the captured contents of the version selector: using
            # the whole page here would also pick up <option> tags belonging
            # to unrelated <select> elements.
            for mobj in re.finditer(
                    r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
                    select):
                entry_id = mobj.group('id')
                # Several options may point at the same entry (the value may
                # carry a "#..." suffix); keep only the first occurrence.
                if entry_id in entry_ids:
                    continue
                entry_ids.add(entry_id)
                entries.append({
                    '_type': 'url_transparent',
                    'url': 'kaltura:%s:%s' % (partner_id, entry_id),
                    'ie_key': 'Kaltura',
                    # [^<]+ also captures layout whitespace around the label.
                    'title': mobj.group('title').strip(),
                })
            # An empty selector falls through to the single-video lookup.
            if entries:
                return self.playlist_result(entries, display_id, title)

        entry_id = self._search_regex(
            r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
            webpage, 'kaltura entry_id', group='id')

        return {
            '_type': 'url_transparent',
            'display_id': display_id,
            'url': 'kaltura:%s:%s' % (partner_id, entry_id),
            'ie_key': 'Kaltura',
            'title': title
        }
Commit	Line	Data
cb23192b S	1	import re
cb23192b S	2
41c10233 SG	3	from .common import InfoExtractor
	4
	5
	6	class CloserToTruthIE(InfoExtractor):
cb23192b S	7	_VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
	8	_TESTS = [{
	9	'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
	10	'info_dict': {
	11	'id': '0_zof1ktre',
	12	'display_id': 'solutions-the-mind-body-problem',
	13	'ext': 'mov',
	14	'title': 'Solutions to the Mind-Body Problem?',
	15	'upload_date': '20140221',
	16	'timestamp': 1392956007,
	17	'uploader_id': 'CTTXML'
	18	},
	19	'params': {
	20	'skip_download': True,
	21	},
	22	}, {
	23	'url': 'http://closertotruth.com/episodes/how-do-brains-work',
	24	'info_dict': {
	25	'id': '0_iuxai6g6',
	26	'display_id': 'how-do-brains-work',
	27	'ext': 'mov',
	28	'title': 'How do Brains Work?',
	29	'upload_date': '20140221',
	30	'timestamp': 1392956024,
	31	'uploader_id': 'CTTXML'
41c10233	32	},
cb23192b S	33	'params': {
cb23192b S	34	'skip_download': True,
41c10233	35	},
cb23192b S	36	}, {
	37	'url': 'http://closertotruth.com/interviews/1725',
	38	'info_dict': {
	39	'id': '1725',
	40	'title': 'AyaFr-002',
41c10233	41	},
cb23192b S	42	'playlist_mincount': 2,
cb23192b S	43	}]
41c10233 SG	44
41c10233 SG	45	def _real_extract(self, url):
cb23192b	46	display_id = self._match_id(url)
41c10233	47
cb23192b	48	webpage = self._download_webpage(url, display_id)
41c10233	49
cb23192b S	50	partner_id = self._search_regex(
	51	r'<script[^>]+src=["\'].*?\b(?:partner_id\|p)/(\d+)',
	52	webpage, 'kaltura partner_id')
41c10233	53
04f3fd2c	54	title = self._html_extract_title(webpage, 'video title')
41c10233	55
cb23192b S	56	select = self._search_regex(
	57	r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
	58	webpage, 'select version', default=None)
	59	if select:
	60	entry_ids = set()
	61	entries = []
	62	for mobj in re.finditer(
	63	r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
	64	webpage):
	65	entry_id = mobj.group('id')
	66	if entry_id in entry_ids:
	67	continue
	68	entry_ids.add(entry_id)
	69	entries.append({
	70	'_type': 'url_transparent',
	71	'url': 'kaltura:%s:%s' % (partner_id, entry_id),
	72	'ie_key': 'Kaltura',
	73	'title': mobj.group('title'),
	74	})
	75	if entries:
	76	return self.playlist_result(entries, display_id, title)
41c10233	77
cb23192b S	78	entry_id = self._search_regex(
	79	r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
	80	webpage, 'kaltura entry_id', group='id')
41c10233 SG	81
	82	return {
	83	'_type': 'url_transparent',
cb23192b S	84	'display_id': display_id,
cb23192b S	85	'url': 'kaltura:%s:%s' % (partner_id, entry_id),
41c10233	86	'ie_key': 'Kaltura',
cb23192b	87	'title': title
41c10233	88	}