yt_dlp/extractor/rtp.py

   1 import base64
   2 import json
   3 import re
   4 import urllib.parse
   5
   6 from .common import InfoExtractor
   7 from ..utils import js_to_json
   8
   9
  10 class RTPIE(InfoExtractor):
  11     _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?'
  12     _TESTS = [{
  13         'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
  14         'md5': 'e736ce0c665e459ddb818546220b4ef8',
  15         'info_dict': {
  16             'id': 'e174042',
  17             'ext': 'mp3',
  18             'title': 'Paixões Cruzadas',
  19             'description': 'As paixões musicais de António Cartaxo e António Macedo',
  20             'thumbnail': r're:^https?://.*\.jpg',
  21         },
  22     }, {
  23         'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
  24         'only_matching': True,
  25     }]
  26
  27     _RX_OBFUSCATION = re.compile(r'''(?xs)
  28         atob\s*\(\s*decodeURIComponent\s*\(\s*
  29             (\[[0-9A-Za-z%,'"]*\])
  30         \s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\)
  31     ''')
  32
  33     def __unobfuscate(self, data, *, video_id):
  34         if data.startswith('{'):
  35             data = self._RX_OBFUSCATION.sub(
  36                 lambda m: json.dumps(
  37                     base64.b64decode(urllib.parse.unquote(
  38                         ''.join(self._parse_json(m.group(1), video_id))
  39                     )).decode('iso-8859-1')),
  40                 data)
  41         return js_to_json(data)
  42
  43     def _real_extract(self, url):
  44         video_id = self._match_id(url)
  45
  46         webpage = self._download_webpage(url, video_id)
  47         title = self._html_search_meta(
  48             'twitter:title', webpage, display_name='title', fatal=True)
  49
  50         f, config = self._search_regex(
  51             r'''(?sx)
  52                 var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*
  53                 var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
  54             ''', webpage,
  55             'player config', group=('f', 'config'))
  56
  57         f = self._parse_json(
  58             f, video_id,
  59             lambda data: self.__unobfuscate(data, video_id=video_id))
  60         config = self._parse_json(
  61             config, video_id,
  62             lambda data: self.__unobfuscate(data, video_id=video_id))
  63
  64         formats = []
  65         if isinstance(f, dict):
  66             f_hls = f.get('hls')
  67             if f_hls is not None:
  68                 formats.extend(self._extract_m3u8_formats(
  69                     f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))
  70
  71             f_dash = f.get('dash')
  72             if f_dash is not None:
  73                 formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash'))
  74         else:
  75             formats.append({
  76                 'format_id': 'f',
  77                 'url': f,
  78                 'vcodec': 'none' if config.get('mediaType') == 'audio' else None,
  79             })
  80
  81         subtitles = {}
  82
  83         vtt = config.get('vtt')
  84         if vtt is not None:
  85             for lcode, lname, url in vtt:
  86                 subtitles.setdefault(lcode, []).append({
  87                     'name': lname,
  88                     'url': url,
  89                 })
  90
  91         return {
  92             'id': video_id,
  93             'title': title,
  94             'formats': formats,
  95             'description': self._html_search_meta(['description', 'twitter:description'], webpage),
  96             'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
  97             'subtitles': subtitles,
  98         }