yt_dlp/extractor/closertotruth.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7
   8
   9 class CloserToTruthIE(InfoExtractor):
  10     _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  11     _TESTS = [{
  12         'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
  13         'info_dict': {
  14             'id': '0_zof1ktre',
  15             'display_id': 'solutions-the-mind-body-problem',
  16             'ext': 'mov',
  17             'title': 'Solutions to the Mind-Body Problem?',
  18             'upload_date': '20140221',
  19             'timestamp': 1392956007,
  20             'uploader_id': 'CTTXML'
  21         },
  22         'params': {
  23             'skip_download': True,
  24         },
  25     }, {
  26         'url': 'http://closertotruth.com/episodes/how-do-brains-work',
  27         'info_dict': {
  28             'id': '0_iuxai6g6',
  29             'display_id': 'how-do-brains-work',
  30             'ext': 'mov',
  31             'title': 'How do Brains Work?',
  32             'upload_date': '20140221',
  33             'timestamp': 1392956024,
  34             'uploader_id': 'CTTXML'
  35         },
  36         'params': {
  37             'skip_download': True,
  38         },
  39     }, {
  40         'url': 'http://closertotruth.com/interviews/1725',
  41         'info_dict': {
  42             'id': '1725',
  43             'title': 'AyaFr-002',
  44         },
  45         'playlist_mincount': 2,
  46     }]
  47
  48     def _real_extract(self, url):
  49         display_id = self._match_id(url)
  50
  51         webpage = self._download_webpage(url, display_id)
  52
  53         partner_id = self._search_regex(
  54             r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
  55             webpage, 'kaltura partner_id')
  56
  57         title = self._html_extract_title(webpage, 'video title')
  58
  59         select = self._search_regex(
  60             r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
  61             webpage, 'select version', default=None)
  62         if select:
  63             entry_ids = set()
  64             entries = []
  65             for mobj in re.finditer(
  66                     r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
  67                     webpage):
  68                 entry_id = mobj.group('id')
  69                 if entry_id in entry_ids:
  70                     continue
  71                 entry_ids.add(entry_id)
  72                 entries.append({
  73                     '_type': 'url_transparent',
  74                     'url': 'kaltura:%s:%s' % (partner_id, entry_id),
  75                     'ie_key': 'Kaltura',
  76                     'title': mobj.group('title'),
  77                 })
  78             if entries:
  79                 return self.playlist_result(entries, display_id, title)
  80
  81         entry_id = self._search_regex(
  82             r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
  83             webpage, 'kaltura entry_id', group='id')
  84
  85         return {
  86             '_type': 'url_transparent',
  87             'display_id': display_id,
  88             'url': 'kaltura:%s:%s' % (partner_id, entry_id),
  89             'ie_key': 'Kaltura',
  90             'title': title
  91         }