]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/rtp.py
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)
[yt-dlp.git] / yt_dlp / extractor / rtp.py
1 import base64
2 import json
3 import re
4 import urllib.parse
5
6 from .common import InfoExtractor
7 from ..utils import js_to_json
8
9
class RTPIE(InfoExtractor):
    """Extractor for rtp.pt/play pages.

    The page embeds two JavaScript values: ``var f`` (either a quoted media
    URL or an object with ``hls``/``dash`` manifest URLs) and the object
    literal passed to ``new RTPPlayer(...)``. Parts of these literals may be
    obfuscated as ``atob(decodeURIComponent([...].join("")))``; they are
    decoded before being parsed as JSON.
    """

    _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?'
    _TESTS = [{
        'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
        'md5': 'e736ce0c665e459ddb818546220b4ef8',
        'info_dict': {
            'id': 'e174042',
            'ext': 'mp3',
            'title': 'Paixões Cruzadas',
            'description': 'As paixões musicais de António Cartaxo e António Macedo',
            'thumbnail': r're:^https?://.*\.jpg',
        },
    }, {
        'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
        'only_matching': True,
    }]

    # Matches `atob(decodeURIComponent([...].join("")))`; group 1 is the array
    # literal of string chunks (elements may use single or double quotes).
    _RX_OBFUSCATION = re.compile(r'''(?xs)
        atob\s*\(\s*decodeURIComponent\s*\(\s*
            (\[[0-9A-Za-z%,'"]*\])
        \s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\)
    ''')

    def __unobfuscate(self, data, *, video_id):
        """Turn a JavaScript literal from the page into JSON text.

        When *data* is an object literal (starts with ``{``), every
        ``atob(decodeURIComponent([...].join("")))`` expression in it is
        replaced by the JSON string it evaluates to. The result is then
        passed through ``js_to_json``.

        *video_id* is forwarded to ``_parse_json`` when parsing the chunk array.
        """
        def decode(match):
            # The pattern allows single-quoted elements, which strict JSON
            # rejects, so normalise the array literal with js_to_json first.
            chunks = self._parse_json(match.group(1), video_id, js_to_json)
            encoded = urllib.parse.unquote(''.join(chunks))
            # Decode the raw bytes one-to-one as ISO-8859-1 code points.
            decoded = base64.b64decode(encoded).decode('iso-8859-1')
            return json.dumps(decoded)

        if data.startswith('{'):
            data = self._RX_OBFUSCATION.sub(decode, data)
        return js_to_json(data)

    def _real_extract(self, url):
        """Download the page at *url* and build the info dict for it."""
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        title = self._html_search_meta(
            'twitter:title', webpage, display_name='title', fatal=True)

        # The negative lookaheads on `*/` skip player setups that sit inside
        # a block comment.
        f, config = self._search_regex(
            r'''(?sx)
                var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*
                var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
            ''', webpage,
            'player config', group=('f', 'config'))

        def to_json(data):
            return self.__unobfuscate(data, video_id=video_id)

        f = self._parse_json(f, video_id, to_json)
        config = self._parse_json(config, video_id, to_json)

        formats = []
        if isinstance(f, dict):
            # Skip missing *and* empty manifest URLs instead of requesting them.
            hls_url = f.get('hls')
            if hls_url:
                formats.extend(self._extract_m3u8_formats(
                    hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))

            dash_url = f.get('dash')
            if dash_url:
                formats.extend(self._extract_mpd_formats(dash_url, video_id, mpd_id='dash'))
        else:
            # `f` is a plain media URL.
            formats.append({
                'format_id': 'f',
                'url': f,
                'vcodec': 'none' if config.get('mediaType') == 'audio' else None,
            })

        subtitles = {}
        # Each `vtt` entry unpacks to (language code, language name, URL).
        # A distinct loop name avoids rebinding the `url` parameter.
        for lang_code, lang_name, sub_url in config.get('vtt') or []:
            subtitles.setdefault(lang_code, []).append({
                'name': lang_name,
                'url': sub_url,
            })

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': self._html_search_meta(['description', 'twitter:description'], webpage),
            'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
            'subtitles': subtitles,
        }