]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/globo.py
[cleanup,youtube] Reorganize Tab and Search extractor inheritances
[yt-dlp.git] / yt_dlp / extractor / globo.py
CommitLineData
f47754f0
S
1# coding: utf-8
2from __future__ import unicode_literals
3
db2058f6
RA
4import base64
5import hashlib
6import json
f47754f0 7import random
26394d02 8import re
f47754f0
S
9
10from .common import InfoExtractor
e5187493 11from ..compat import (
e5187493
RA
12 compat_str,
13)
8c25f81b
PH
14from ..utils import (
15 ExtractorError,
16 float_or_none,
26394d02 17 orderedSet,
e7d34c03 18 str_or_none,
b89378a6 19 try_get,
8c25f81b 20)
f47754f0
S
21
22
class GloboIE(InfoExtractor):
    """Extractor for a single Globo video.

    Matches ``*.globo.com`` URLs whose path ends in ``v/<id>`` or
    ``videos/<id>`` as well as the internal ``globo:<id>`` form, where the id
    is a number of at least seven digits.
    """

    _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})'
    _NETRC_MACHINE = 'globo'
    _TESTS = [{
        'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/',
        'info_dict': {
            'id': '3607726',
            'ext': 'mp4',
            'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
            'duration': 103.204,
            'uploader': 'G1',
            'uploader_id': '2015',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://globoplay.globo.com/v/4581987/',
        'info_dict': {
            'id': '4581987',
            'ext': 'mp4',
            'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP',
            'duration': 137.973,
            'uploader': 'Rede Globo',
            'uploader_id': '196',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html',
        'only_matching': True,
    }, {
        'url': 'http://globosatplay.globo.com/globonews/v/4472924/',
        'only_matching': True,
    }, {
        'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/',
        'only_matching': True,
    }, {
        'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/',
        'only_matching': True,
    }, {
        'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html',
        'only_matching': True,
    }, {
        'url': 'globo:3607726',
        'only_matching': True,
    }]

    def _sign_security_hash(self, security_hash):
        """Return the signed value for the ``h`` query parameter.

        The first two characters of *security_hash* select where the
        timestamp, the received MD5 and the prefix sit inside the token.

        Raises ExtractorError when the two-character type is not one of the
        layouts handled here (previously this surfaced as an
        UnboundLocalError).
        """
        hash_code = security_hash[:2]
        # randrange() excludes the upper bound, so the number always fits the
        # ten-digit zero-padded field (randint() could yield 11 digits).
        padding = '%010d' % random.randrange(1, 10000000000)
        if hash_code in ('04', '14'):
            received_time = security_hash[3:13]
            received_md5 = security_hash[24:]
            hash_prefix = security_hash[:23]
        elif hash_code in ('02', '12', '03', '13'):
            received_time = security_hash[2:12]
            received_md5 = security_hash[22:]
            padding += '1'
            hash_prefix = '05' + security_hash[:22]
        else:
            raise ExtractorError(
                '%s returned an unsupported security hash type: %s' % (self.IE_NAME, hash_code))

        # 86400 is the number of seconds in a day; presumably this pushes the
        # token timestamp 24h forward - confirm against the web player.
        padded_sign_time = compat_str(int(received_time) + 86400) + padding
        # '0xAC10FD' is a fixed suffix mixed into the MD5 input.
        md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode()
        signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=')
        return hash_prefix + padded_sign_time + signed_md5

    def _real_extract(self, url):
        """Extract metadata, HLS formats and subtitles for one video.

        Downloads the playlist entry for the id, requests a playback session,
        signs the session token and uses the signed URL to fetch the HLS
        manifest.

        Raises ExtractorError when the session response carries no token
        (using the API's ``message`` when present) or when the token type is
        unknown.
        """
        video_id = self._match_id(url)

        video = self._download_json(
            'http://api.globovideos.com/videos/%s/playlist' % video_id,
            video_id)['videos'][0]
        if not self.get_param('allow_unplayable_formats') and video.get('encrypted') is True:
            self.report_drm(video_id)

        title = video['title']

        security = self._download_json(
            'https://playback.video.globo.com/v1/video-session', video_id, 'Downloading security hash for %s' % video_id,
            headers={'content-type': 'application/json'}, data=json.dumps({
                "player_type": "desktop",
                "video_id": video_id,
                "quality": "max",
                "content_protection": "widevine",
                "vsid": "581b986b-4c40-71f0-5a58-803e579d5fa2",
                "tz": "-3.0:00"
            }).encode())

        # Look the token up defensively so that an error payload without a
        # 'source' object still reaches the message handling below.
        security_hash = try_get(security, lambda x: x['source']['token'], compat_str)
        if not security_hash:
            message = security.get('message')
            if message:
                raise ExtractorError(
                    '%s returned error: %s' % (self.IE_NAME, message), expected=True)
            # Without a token nothing can be signed; fail explicitly instead
            # of crashing on the slicing that follows.
            raise ExtractorError('Unable to extract security hash')

        signed_hash = self._sign_security_hash(security_hash)
        source = security['source']['url_parts']
        resource_url = source['scheme'] + '://' + source['domain'] + source['path']
        signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A')

        formats = self._extract_m3u8_formats(
            signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
        self._sort_formats(formats)

        subtitles = {}
        for resource in video.get('resources') or []:
            # Skip subtitle resources that carry no URL instead of emitting
            # an entry with 'url': None.
            if resource.get('type') == 'subtitle' and resource.get('url'):
                subtitles.setdefault(resource.get('language') or 'por', []).append({
                    'url': resource['url'],
                })
        # The session response may list subtitles under either key; both are
        # language -> URL mappings and are handled the same way.
        for subs_key in ('subtitles', 'subtitles_webvtt'):
            subs = try_get(security, lambda x: x['source'][subs_key], expected_type=dict) or {}
            for sub_lang, sub_url in subs.items():
                if sub_url:
                    subtitles.setdefault(sub_lang or 'por', []).append({
                        'url': sub_url,
                    })

        return {
            'id': video_id,
            'title': title,
            # The API value is divided by 1000 (float_or_none scale argument).
            'duration': float_or_none(video.get('duration'), 1000),
            'uploader': video.get('channel'),
            'uploader_id': str_or_none(video.get('channel_id')),
            'formats': formats,
            'subtitles': subtitles,
        }
ad607563
S
158
159
class GloboArticleIE(InfoExtractor):
    """Playlist extractor for ``*.globo.com`` article pages.

    Scans the page markup for embedded video ids and returns each of them as
    a ``globo:<id>`` entry handled by GloboIE.
    """

    _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/.]+)(?:\.html)?'

    # Patterns tried against the page; each one captures a video id of at
    # least seven digits.
    _VIDEOID_REGEXES = [
        r'\bdata-video-id=["\'](\d{7,})',
        r'\bdata-player-videosids=["\'](\d{7,})',
        r'\bvideosIDs\s*:\s*["\']?(\d{7,})',
        r'\bdata-id=["\'](\d{7,})',
        r'<div[^>]+\bid=["\'](\d{7,})',
    ]

    _TESTS = [{
        'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html',
        'info_dict': {
            'id': 'novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes',
            'title': 'Novidade na fiscalização de bagagem pela Receita provoca discussões',
            'description': 'md5:c3c4b4d4c30c32fce460040b1ac46b12',
        },
        'playlist_count': 1,
    }, {
        'url': 'http://g1.globo.com/pr/parana/noticia/2016/09/mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato.html',
        'info_dict': {
            'id': 'mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato',
            'title': "Lula era o 'comandante máximo' do esquema da Lava Jato, diz MPF",
            'description': 'md5:8aa7cc8beda4dc71cc8553e00b77c54c',
        },
        'playlist_count': 6,
    }, {
        'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html',
        'only_matching': True,
    }, {
        'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html',
        'only_matching': True,
    }, {
        'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline any URL that GloboIE already accepts; otherwise defer to the base check."""
        if GloboIE.suitable(url):
            return False
        return super(GloboArticleIE, cls).suitable(url)

    def _real_extract(self, url):
        """Download the article and return a playlist of the video ids found in it."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # Gather matches pattern by pattern, keeping the order in which the
        # patterns are listed.
        found_ids = [
            match
            for pattern in self._VIDEOID_REGEXES
            for match in re.findall(pattern, webpage)]

        # orderedSet drops repeated ids before the entries are built.
        entries = []
        for video_id in orderedSet(found_ids):
            entries.append(self.url_result('globo:%s' % video_id, GloboIE.ie_key()))

        return self.playlist_result(
            entries, display_id,
            self._og_search_title(webpage, fatal=False),
            self._html_search_meta('description', webpage))