yt_dlp/extractor/closertotruth.py

   1 import re
   2
   3 from .common import InfoExtractor
   4
   5
   6 class CloserToTruthIE(InfoExtractor):
   7     _WORKING = False
   8     _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
   9     _TESTS = [{
  10         'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
  11         'info_dict': {
  12             'id': '0_zof1ktre',
  13             'display_id': 'solutions-the-mind-body-problem',
  14             'ext': 'mov',
  15             'title': 'Solutions to the Mind-Body Problem?',
  16             'upload_date': '20140221',
  17             'timestamp': 1392956007,
  18             'uploader_id': 'CTTXML',
  19         },
  20         'params': {
  21             'skip_download': True,
  22         },
  23     }, {
  24         'url': 'http://closertotruth.com/episodes/how-do-brains-work',
  25         'info_dict': {
  26             'id': '0_iuxai6g6',
  27             'display_id': 'how-do-brains-work',
  28             'ext': 'mov',
  29             'title': 'How do Brains Work?',
  30             'upload_date': '20140221',
  31             'timestamp': 1392956024,
  32             'uploader_id': 'CTTXML',
  33         },
  34         'params': {
  35             'skip_download': True,
  36         },
  37     }, {
  38         'url': 'http://closertotruth.com/interviews/1725',
  39         'info_dict': {
  40             'id': '1725',
  41             'title': 'AyaFr-002',
  42         },
  43         'playlist_mincount': 2,
  44     }]
  45
  46     def _real_extract(self, url):
  47         display_id = self._match_id(url)
  48
  49         webpage = self._download_webpage(url, display_id)
  50
  51         partner_id = self._search_regex(
  52             r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
  53             webpage, 'kaltura partner_id')
  54
  55         title = self._html_extract_title(webpage, 'video title')
  56
  57         select = self._search_regex(
  58             r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
  59             webpage, 'select version', default=None)
  60         if select:
  61             entry_ids = set()
  62             entries = []
  63             for mobj in re.finditer(
  64                     r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
  65                     webpage):
  66                 entry_id = mobj.group('id')
  67                 if entry_id in entry_ids:
  68                     continue
  69                 entry_ids.add(entry_id)
  70                 entries.append({
  71                     '_type': 'url_transparent',
  72                     'url': f'kaltura:{partner_id}:{entry_id}',
  73                     'ie_key': 'Kaltura',
  74                     'title': mobj.group('title'),
  75                 })
  76             if entries:
  77                 return self.playlist_result(entries, display_id, title)
  78
  79         entry_id = self._search_regex(
  80             r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
  81             webpage, 'kaltura entry_id', group='id')
  82
  83         return {
  84             '_type': 'url_transparent',
  85             'display_id': display_id,
  86             'url': f'kaltura:{partner_id}:{entry_id}',
  87             'ie_key': 'Kaltura',
  88             'title': title,
  89         }