yt_dlp/extractor/closertotruth.py

   1 import re
   2
   3 from .common import InfoExtractor
   4
   5
   6 class CloserToTruthIE(InfoExtractor):
   7     _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
   8     _TESTS = [{
   9         'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
  10         'info_dict': {
  11             'id': '0_zof1ktre',
  12             'display_id': 'solutions-the-mind-body-problem',
  13             'ext': 'mov',
  14             'title': 'Solutions to the Mind-Body Problem?',
  15             'upload_date': '20140221',
  16             'timestamp': 1392956007,
  17             'uploader_id': 'CTTXML'
  18         },
  19         'params': {
  20             'skip_download': True,
  21         },
  22     }, {
  23         'url': 'http://closertotruth.com/episodes/how-do-brains-work',
  24         'info_dict': {
  25             'id': '0_iuxai6g6',
  26             'display_id': 'how-do-brains-work',
  27             'ext': 'mov',
  28             'title': 'How do Brains Work?',
  29             'upload_date': '20140221',
  30             'timestamp': 1392956024,
  31             'uploader_id': 'CTTXML'
  32         },
  33         'params': {
  34             'skip_download': True,
  35         },
  36     }, {
  37         'url': 'http://closertotruth.com/interviews/1725',
  38         'info_dict': {
  39             'id': '1725',
  40             'title': 'AyaFr-002',
  41         },
  42         'playlist_mincount': 2,
  43     }]
  44
  45     def _real_extract(self, url):
  46         display_id = self._match_id(url)
  47
  48         webpage = self._download_webpage(url, display_id)
  49
  50         partner_id = self._search_regex(
  51             r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
  52             webpage, 'kaltura partner_id')
  53
  54         title = self._html_extract_title(webpage, 'video title')
  55
  56         select = self._search_regex(
  57             r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
  58             webpage, 'select version', default=None)
  59         if select:
  60             entry_ids = set()
  61             entries = []
  62             for mobj in re.finditer(
  63                     r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
  64                     webpage):
  65                 entry_id = mobj.group('id')
  66                 if entry_id in entry_ids:
  67                     continue
  68                 entry_ids.add(entry_id)
  69                 entries.append({
  70                     '_type': 'url_transparent',
  71                     'url': 'kaltura:%s:%s' % (partner_id, entry_id),
  72                     'ie_key': 'Kaltura',
  73                     'title': mobj.group('title'),
  74                 })
  75             if entries:
  76                 return self.playlist_result(entries, display_id, title)
  77
  78         entry_id = self._search_regex(
  79             r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
  80             webpage, 'kaltura entry_id', group='id')
  81
  82         return {
  83             '_type': 'url_transparent',
  84             'display_id': display_id,
  85             'url': 'kaltura:%s:%s' % (partner_id, entry_id),
  86             'ie_key': 'Kaltura',
  87             'title': title
  88         }