]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/rtp.py
[LnkIE] Add extractor (#2408)
[yt-dlp.git] / yt_dlp / extractor / rtp.py
CommitLineData
c3f3b29b
NJ
1# coding: utf-8
2from __future__ import unicode_literals
3
c3f3b29b 4from .common import InfoExtractor
182b6ae8
F
5from ..utils import js_to_json
6import re
7import json
8import urllib.parse
9import base64
c3f3b29b
NJ
10
11
class RTPIE(InfoExtractor):
    # Extractor for pages matching rtp.pt/play/p<program_id>/<id>.
    _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?'
    _TESTS = [{
        'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
        'md5': 'e736ce0c665e459ddb818546220b4ef8',
        'info_dict': {
            'id': 'e174042',
            'ext': 'mp3',
            'title': 'Paixões Cruzadas',
            'description': 'As paixões musicais de António Cartaxo e António Macedo',
            'thumbnail': r're:^https?://.*\.jpg',
        },
    }, {
        'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
        'only_matching': True,
    }]

    # Matches the JavaScript expression
    #   atob(decodeURIComponent([...].join("")))
    # and captures the array literal (group 1). The array elements may be
    # quoted with either single or double quotes.
    _RX_OBFUSCATION = re.compile(r'''(?xs)
        atob\s*\(\s*decodeURIComponent\s*\(\s*
            (\[[0-9A-Za-z%,'"]*\])
        \s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\)
    ''')

    def __unobfuscate(self, data, *, video_id):
        """Turn a JavaScript value from the page into JSON text.

        When ``data`` is an object literal (starts with ``{``), every
        ``atob(decodeURIComponent([...].join("")))`` expression in it is
        replaced by a JSON string literal holding the decoded value. The
        result is then passed through ``js_to_json``.

        ``video_id`` is only forwarded to ``_parse_json``.
        """
        if data.startswith('{'):
            data = self._RX_OBFUSCATION.sub(
                lambda m: json.dumps(self.__decode_chunks(m.group(1), video_id)),
                data)
        return js_to_json(data)

    def __decode_chunks(self, array_literal, video_id):
        """Decode one obfuscated array literal into the string it hides."""
        # The regex also admits single-quoted elements, which strict JSON
        # rejects; normalise the literal with js_to_json before parsing.
        chunks = self._parse_json(array_literal, video_id, js_to_json)
        # Mirror the page's JavaScript: join -> decodeURIComponent -> atob.
        encoded = urllib.parse.unquote(''.join(chunks))
        return base64.b64decode(encoded).decode('iso-8859-1')

    def _real_extract(self, url):
        """Build the info dict for a single RTP Play page."""
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        title = self._html_search_meta(
            'twitter:title', webpage, display_name='title', fatal=True)

        # `f` is either a quoted string or an object literal; `config` is the
        # object passed to `new RTPPlayer(...)`. The negative lookaheads on
        # `*/` reject matches that run into or are followed by a comment end.
        f, config = self._search_regex(
            r'''(?sx)
                var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*
                var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
            ''', webpage,
            'player config', group=('f', 'config'))

        f = self._parse_json(
            f, video_id,
            lambda data: self.__unobfuscate(data, video_id=video_id))
        config = self._parse_json(
            config, video_id,
            lambda data: self.__unobfuscate(data, video_id=video_id))

        formats = []
        if isinstance(f, dict):
            # Object form: may carry an HLS and/or a DASH manifest URL.
            f_hls = f.get('hls')
            if f_hls is not None:
                formats.extend(self._extract_m3u8_formats(
                    f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))

            f_dash = f.get('dash')
            if f_dash is not None:
                formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash'))
        else:
            # Non-dict form: the parsed value itself is used as the media URL.
            formats.append({
                'format_id': 'f',
                'url': f,
                'vcodec': 'none' if config.get('mediaType') == 'audio' else None,
            })

        subtitles = {}

        vtt = config.get('vtt')
        if isinstance(vtt, (list, tuple)):
            for entry in vtt:
                # Subtitles are optional metadata: skip malformed entries
                # rather than aborting the whole extraction.
                if not isinstance(entry, (list, tuple)) or len(entry) < 3:
                    continue
                # Named `sub_url` so the `url` parameter is not shadowed.
                lcode, lname, sub_url = entry[:3]
                subtitles.setdefault(lcode, []).append({
                    'name': lname,
                    'url': sub_url,
                })

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': self._html_search_meta(['description', 'twitter:description'], webpage),
            'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
            'subtitles': subtitles,
        }