]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/globo.py
[fragments] Pad fragments before decrypting (#1298)
[yt-dlp.git] / yt_dlp / extractor / globo.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import hashlib
6 import json
7 import random
8 import re
9
10 from .common import InfoExtractor
11 from ..compat import (
12 compat_str,
13 )
14 from ..utils import (
15 ExtractorError,
16 float_or_none,
17 orderedSet,
18 str_or_none,
19 try_get,
20 )
21
22
class GloboIE(InfoExtractor):
    """Extractor for a single Globo video addressed by its numeric id.

    Matches ``globo:<id>`` pseudo-URLs as well as ``*.globo.com`` page URLs
    that embed a video id of at least seven digits.
    """

    _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})'
    _NETRC_MACHINE = 'globo'
    _TESTS = [{
        'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/',
        'info_dict': {
            'id': '3607726',
            'ext': 'mp4',
            'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
            'duration': 103.204,
            'uploader': 'G1',
            'uploader_id': '2015',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://globoplay.globo.com/v/4581987/',
        'info_dict': {
            'id': '4581987',
            'ext': 'mp4',
            'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP',
            'duration': 137.973,
            'uploader': 'Rede Globo',
            'uploader_id': '196',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html',
        'only_matching': True,
    }, {
        'url': 'http://globosatplay.globo.com/globonews/v/4472924/',
        'only_matching': True,
    }, {
        'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/',
        'only_matching': True,
    }, {
        'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/',
        'only_matching': True,
    }, {
        'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html',
        'only_matching': True,
    }, {
        'url': 'globo:3607726',
        'only_matching': True,
    }]

    def _sign_security_hash(self, security_hash):
        """Build the signed hash sent as the ``h`` query parameter.

        The first two characters of ``security_hash`` select the layout used
        to slice out the timestamp, the received MD5 and the prefix.

        Raises ExtractorError when the layout code is not one of the known
        ones, instead of failing later on unbound local variables.
        """
        hash_code = security_hash[:2]
        # random.randint() includes both endpoints, so the upper bound must be
        # the largest 10-digit number to keep the padding exactly 10 chars.
        padding = '%010d' % random.randint(1, 9999999999)
        if hash_code in ('04', '14'):
            received_time = security_hash[3:13]
            received_md5 = security_hash[24:]
            hash_prefix = security_hash[:23]
        elif hash_code in ('02', '12', '03', '13'):
            received_time = security_hash[2:12]
            received_md5 = security_hash[22:]
            padding += '1'
            hash_prefix = '05' + security_hash[:22]
        else:
            raise ExtractorError(
                'Unsupported security hash version: %s' % hash_code)

        # 86400 is added to the received time (presumably a Unix timestamp in
        # seconds, i.e. a one-day offset) before the random padding is appended.
        padded_sign_time = compat_str(int(received_time) + 86400) + padding
        md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode()
        signed_md5 = base64.urlsafe_b64encode(
            hashlib.md5(md5_data).digest()).decode().strip('=')
        return hash_prefix + padded_sign_time + signed_md5

    def _real_extract(self, url):
        """Return the info dict (formats, subtitles, metadata) for one video.

        Raises ExtractorError when the video-session endpoint does not hand
        out a usable security token.
        """
        video_id = self._match_id(url)

        video = self._download_json(
            'http://api.globovideos.com/videos/%s/playlist' % video_id,
            video_id)['videos'][0]
        if not self.get_param('allow_unplayable_formats') and video.get('encrypted') is True:
            self.report_drm(video_id)

        title = video['title']

        formats = []
        security = self._download_json(
            'https://playback.video.globo.com/v1/video-session', video_id, 'Downloading security hash for %s' % video_id,
            headers={'content-type': 'application/json'}, data=json.dumps({
                "player_type": "desktop",
                "video_id": video_id,
                "quality": "max",
                "content_protection": "widevine",
                "vsid": "581b986b-4c40-71f0-5a58-803e579d5fa2",
                "tz": "-3.0:00"
            }).encode())

        # An error response may lack 'source' altogether, so look the token up
        # defensively and always stop here when it is unavailable.
        security_hash = try_get(
            security, lambda x: x['source']['token'], compat_str)
        if not security_hash:
            message = security.get('message')
            if message:
                raise ExtractorError(
                    '%s returned error: %s' % (self.IE_NAME, message), expected=True)
            raise ExtractorError('Unable to extract security hash')

        signed_hash = self._sign_security_hash(security_hash)
        source = security['source']['url_parts']
        resource_url = source['scheme'] + '://' + source['domain'] + source['path']
        signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A')

        formats.extend(self._extract_m3u8_formats(
            signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
        self._sort_formats(formats)

        subtitles = {}
        # Subtitles listed among the playlist resources; entries without a URL
        # are skipped because they cannot be downloaded.
        for resource in video.get('resources') or []:
            if resource.get('type') != 'subtitle':
                continue
            sub_url = resource.get('url')
            if not sub_url:
                continue
            subtitles.setdefault(resource.get('language') or 'por', []).append({
                'url': sub_url,
            })
        # Subtitles advertised by the video session, as language -> URL dicts.
        for subs_key in ('subtitles', 'subtitles_webvtt'):
            subs = try_get(
                security, lambda x, key=subs_key: x['source'][key],
                expected_type=dict) or {}
            for sub_lang, sub_url in subs.items():
                if sub_url:
                    subtitles.setdefault(sub_lang or 'por', []).append({
                        'url': sub_url,
                    })

        # The reported duration is divided by 1000 (float_or_none scale).
        duration = float_or_none(video.get('duration'), 1000)
        uploader = video.get('channel')
        uploader_id = str_or_none(video.get('channel_id'))

        return {
            'id': video_id,
            'title': title,
            'duration': duration,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'formats': formats,
            'subtitles': subtitles,
        }
158
159
class GloboArticleIE(InfoExtractor):
    """Playlist extractor for ``*.globo.com`` article pages.

    Scans the page markup for embedded video ids and delegates each one to
    GloboIE through a ``globo:<id>`` URL.
    """

    _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/.]+)(?:\.html)?'

    # Patterns tried against the page, each capturing a video id of at least
    # seven digits.
    _VIDEOID_REGEXES = [
        r'\bdata-video-id=["\'](\d{7,})',
        r'\bdata-player-videosids=["\'](\d{7,})',
        r'\bvideosIDs\s*:\s*["\']?(\d{7,})',
        r'\bdata-id=["\'](\d{7,})',
        r'<div[^>]+\bid=["\'](\d{7,})',
    ]

    _TESTS = [{
        'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html',
        'info_dict': {
            'id': 'novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes',
            'title': 'Novidade na fiscalização de bagagem pela Receita provoca discussões',
            'description': 'md5:c3c4b4d4c30c32fce460040b1ac46b12',
        },
        'playlist_count': 1,
    }, {
        'url': 'http://g1.globo.com/pr/parana/noticia/2016/09/mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato.html',
        'info_dict': {
            'id': 'mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato',
            'title': "Lula era o 'comandante máximo' do esquema da Lava Jato, diz MPF",
            'description': 'md5:8aa7cc8beda4dc71cc8553e00b77c54c',
        },
        'playlist_count': 6,
    }, {
        'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html',
        'only_matching': True,
    }, {
        'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html',
        'only_matching': True,
    }, {
        'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline any URL that GloboIE already accepts."""
        if GloboIE.suitable(url):
            return False
        return super(GloboArticleIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist with one GloboIE entry per distinct video id."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # Gather matches pattern by pattern, keeping the order they are found.
        found_ids = [
            match
            for pattern in self._VIDEOID_REGEXES
            for match in re.findall(pattern, webpage)]

        # orderedSet drops repeated ids while preserving first-seen order.
        entries = []
        for found_id in orderedSet(found_ids):
            entries.append(
                self.url_result('globo:%s' % found_id, GloboIE.ie_key()))

        page_title = self._og_search_title(webpage, fatal=False)
        page_description = self._html_search_meta('description', webpage)
        return self.playlist_result(
            entries, display_id, page_title, page_description)