X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/5ad28e7ffd41deccba33776c1609aa7855601739..61edf57f8f13f6dfd81154174e647eb5fdd26089:/yt_dlp/extractor/rtve.py diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py index 59832eeac..7e0b666ab 100644 --- a/yt_dlp/extractor/rtve.py +++ b/yt_dlp/extractor/rtve.py @@ -1,27 +1,18 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import io -import sys +import struct from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_struct_unpack, -) from ..utils import ( - determine_ext, ExtractorError, + determine_ext, float_or_none, qualities, remove_end, remove_start, - std_headers, + try_get, ) -_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x)) - class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' @@ -70,16 +61,16 @@ class RTVEALaCartaIE(InfoExtractor): }] def _real_initialize(self): - user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') + user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode()).decode('utf-8') self._manager = self._download_json( 'http://www.rtve.es/odin/loki/' + user_agent_b64, None, 'Fetching manager info')['manager'] @staticmethod def _decrypt_url(png): - encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) + encrypted_data = io.BytesIO(base64.b64decode(png)[8:]) while True: - length = compat_struct_unpack('!I', encrypted_data.read(4))[0] + length = struct.unpack('!I', encrypted_data.read(4))[0] chunk_type = encrypted_data.read(4) if chunk_type == b'IEND': break @@ -90,7 +81,7 @@ def _decrypt_url(png): alphabet = [] e = 0 d = 0 - for l in _bytes_to_chr(alphabet_data): + for l in alphabet_data.decode('iso-8859-1'): if d == 0: alphabet.append(l) d = e = (e + 1) % 4 @@ -100,7 +91,7 @@ def _decrypt_url(png): f = 0 e = 3 b = 1 - for letter in _bytes_to_chr(url_data): + for letter in url_data.decode('iso-8859-1'): if f == 0: l = int(letter) * 10 f = 1 @@ -119,7 +110,7 @@ def _decrypt_url(png): def _extract_png_formats(self, video_id): png = self._download_webpage( - 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id), + f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/videos/{video_id}.png', video_id, 'Downloading url information', query={'q': 'v2'}) q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) formats = [] @@ -138,13 +129,12 @@ def _extract_png_formats(self, video_id): 'quality': q(quality), 'url': video_url, }) - self._sort_formats(formats) return formats def _real_extract(self, url): video_id = self._match_id(url) info = self._download_json( - 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, + f'http://www.rtve.es/api/videos/{video_id}/config/alacarta_videos.json', video_id)['page']['items'][0] if info['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) @@ -160,7 +150,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'formats': formats, 'thumbnail': info.get('image'), 'subtitles': subtitles, @@ -178,7 +168,92 @@ def _get_subtitles(self, video_id, sub_file): for s in subs) -class RTVEInfantilIE(RTVEALaCartaIE): +class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE + IE_NAME = 'rtve.es:audio' + IE_DESC = 'RTVE audio' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P[0-9]+)' + + _TESTS = [{ + 'url': 'https://www.rtve.es/alacarta/audios/a-hombros-de-gigantes/palabra-ingeniero-codigos-informaticos-27-04-21/5889192/', + 'md5': 'ae06d27bff945c4e87a50f89f6ce48ce', + 'info_dict': { + 'id': '5889192', + 'ext': 'mp3', + 'title': 'Códigos informáticos', + 'thumbnail': r're:https?://.+/1598856591583.jpg', + 'duration': 349.440, + 'series': 'A hombros de gigantes', + }, + }, { + 'url': 'https://www.rtve.es/play/audios/en-radio-3/ignatius-farray/5791165/', + 'md5': '072855ab89a9450e0ba314c717fa5ebc', + 'info_dict': { + 'id': '5791165', + 'ext': 'mp3', + 'title': 'Ignatius Farray', + 'thumbnail': r're:https?://.+/1613243011863.jpg', + 'duration': 3559.559, + 'series': 'En Radio 3', + }, + }, { + 'url': 'https://www.rtve.es/play/audios/frankenstein-o-el-moderno-prometeo/capitulo-26-ultimo-muerte-victor-juan-jose-plans-mary-shelley/6082623/', + 'md5': '0eadab248cc8dd193fa5765712e84d5c', + 'info_dict': { + 'id': '6082623', + 'ext': 'mp3', + 'title': 'Capítulo 26 y último: La muerte de Victor', + 'thumbnail': r're:https?://.+/1632147445707.jpg', + 'duration': 3174.086, + 'series': 'Frankenstein o el moderno Prometeo', + }, + }] + + def _extract_png_formats(self, audio_id): + """ + This function retrieves media related png thumbnail which obfuscate + valuable information about the media. This information is decrypted + via base class _decrypt_url function providing media quality and + media url + """ + png = self._download_webpage( + f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/audios/{audio_id}.png', + audio_id, 'Downloading url information', query={'q': 'v2'}) + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + formats = [] + for quality, audio_url in self._decrypt_url(png): + ext = determine_ext(audio_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + audio_url, audio_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + audio_url, audio_id, 'dash', fatal=False)) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': audio_url, + }) + return formats + + def _real_extract(self, url): + audio_id = self._match_id(url) + info = self._download_json( + f'https://www.rtve.es/api/audios/{audio_id}.json', + audio_id)['page']['items'][0] + + return { + 'id': audio_id, + 'title': info['title'].strip(), + 'thumbnail': info.get('thumbnail'), + 'duration': float_or_none(info.get('duration'), 1000), + 'series': try_get(info, lambda x: x['programInfo']['title']), + 'formats': self._extract_png_formats(audio_id), + } + + +class RTVEInfantilIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE IE_NAME = 'rtve.es:infantil' IE_DESC = 'RTVE infantil' _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P[0-9]+)/' @@ -197,7 +272,7 @@ class RTVEInfantilIE(RTVEALaCartaIE): }] -class RTVELiveIE(RTVEALaCartaIE): +class RTVELiveIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)' @@ -211,7 +286,7 @@ class RTVELiveIE(RTVEALaCartaIE): }, 'params': { 'skip_download': 'live stream', - } + }, }] def _real_extract(self, url): @@ -230,7 +305,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': self._live_title(title), + 'title': title, 'formats': self._extract_png_formats(vidplayer_id), 'is_live': True, }