]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/kinja.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / kinja.py
CommitLineData
55adb63e
RA
1from .common import InfoExtractor
2from ..compat import (
3 compat_str,
4 compat_urllib_parse_unquote,
5)
6from ..utils import (
7 int_or_none,
8 parse_iso8601,
9 strip_or_none,
10 try_get,
55adb63e
RA
11)
12
13
class KinjaEmbedIE(InfoExtractor):
    """Extractor for iframe embeds served by Kinja-network sites.

    The embed URL carries an ``id`` query value of the form
    ``<type>-<id>``. Types listed in ``_PROVIDER_MAP`` are turned into a URL
    on the third-party site and handed to ``url_result``; the remaining
    types are resolved here through JSON APIs (``kinjavideo`` via kinja.com,
    anything else via the Univision endpoints).
    """

    IE_NAME = 'kinja:embed'

    # Optional subdomain followed by one of the known ``.com`` hosts.
    # NOTE: this fragment and _COMMON_REGEX are only ever embedded in
    # verbose-mode (``(?x)``) patterns below, so the whitespace and line
    # breaks inside them are not part of the match.
    _DOMAIN_REGEX = r'''(?:[^.]+\.)?
        (?:
            avclub|
            clickhole|
            deadspin|
            gizmodo|
            jalopnik|
            jezebel|
            kinja|
            kotaku|
            lifehacker|
            splinternews|
            the(?:inventory|onion|root|takeout)
        )\.com'''

    # Path of the iframe endpoint, up to and including the ``id=`` parameter.
    _COMMON_REGEX = r'''/
        (?:
            ajax/inset|
            embed/video
        )/iframe\?.*?\bid='''

    # Full embed URL; ``type`` selects the provider, ``id`` is everything up
    # to the next ``&``.
    _VALID_URL = r'''(?x)https?://%s%s
        (?P<type>
            fb|
            imgur|
            instagram|
            jwp(?:layer)?-video|
            kinjavideo|
            mcp|
            megaphone|
            soundcloud(?:-playlist)?|
            tumblr-post|
            twitch-stream|
            twitter|
            ustream-channel|
            vimeo|
            vine|
            youtube-(?:list|video)
        )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX)

    # Pattern for <iframe> tags whose src points at the embed endpoint; the
    # scheme and host part is optional in this pattern.
    _EMBED_REGEX = [rf'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//{_DOMAIN_REGEX})?{_COMMON_REGEX}(?:(?!\1).)+)\1']

    _TESTS = [{
        'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=kinjavideo-100313',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-playlist-317413750',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=tumblr-post-160130699814-daydreams-at-midnight',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=twitch-stream-libratus_extra',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=twitter-1068875942473404422',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=ustream-channel-10414700',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=vimeo-120153502',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=vine-5BlvV5qqPrD',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-list-BCQ3KyrPjgA/PLE6509247C270A72E',
        'only_matching': True,
    }, {
        'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-video-00QyL0AgPAE',
        'only_matching': True,
    }]

    # Both JW Player type spellings share one provider entry.
    _JWPLATFORM_PROVIDER = ('cdn.jwplayer.com/v2/media/', 'JWPlatform')

    # embed type -> (scheme-less URL prefix or %-template, name passed as the
    # second argument to url_result -- presumably the target extractor key).
    _PROVIDER_MAP = {
        'fb': ('facebook.com/video.php?v=', 'Facebook'),
        'imgur': ('imgur.com/', 'Imgur'),
        'instagram': ('instagram.com/p/', 'Instagram'),
        'jwplayer-video': _JWPLATFORM_PROVIDER,
        'jwp-video': _JWPLATFORM_PROVIDER,
        'megaphone': ('player.megaphone.fm/', 'Generic'),
        'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'),
        'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'),
        'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'),
        'twitch-stream': ('twitch.tv/', 'TwitchStream'),
        'twitter': ('twitter.com/i/cards/tfw/v1/', 'TwitterCard'),
        'ustream-channel': ('ustream.tv/embed/', 'Ustream'),
        'vimeo': ('vimeo.com/', 'Vimeo'),
        'vine': ('vine.co/v/', 'Vine'),
        'youtube-list': ('youtube.com/embed/%s?list=%s', 'YoutubePlaylist'),
        'youtube-video': ('youtube.com/embed/', 'Youtube'),
    }

    def _real_extract(self, url):
        """Resolve an embed URL to a ``url_result`` or an info dict.

        Types present in ``_PROVIDER_MAP`` are delegated via ``url_result``;
        ``kinjavideo`` and every remaining type are extracted directly.
        """
        embed_type, embed_id = self._match_valid_url(url).groups()

        provider = self._PROVIDER_MAP.get(embed_type)
        if provider:
            url_tail, ie_key = provider
            # The id is only URL-unquoted on the delegation path.
            embed_id = compat_urllib_parse_unquote(embed_id)
            if embed_type == 'tumblr-post':
                # "<post id>-<blog name>": the blog name becomes the subdomain.
                post_id, blog = embed_id.split('-', 1)
                target = url_tail % (blog, post_id)
            elif embed_type == 'youtube-list':
                # "<video id>/<playlist id>"
                first_video_id, playlist_id = embed_id.split('/')
                target = url_tail % (first_video_id, playlist_id)
            else:
                target = url_tail + embed_id
            return self.url_result('http://' + target, ie_key)

        if embed_type == 'kinjavideo':
            return self._extract_kinja_video(embed_id)
        return self._extract_mcp_video(embed_id)

    def _extract_kinja_video(self, video_id):
        """Build the info dict for a ``kinjavideo`` embed from the Kinja API."""
        data = self._download_json(
            'https://kinja.com/api/core/video/views/videoById',
            video_id, query={'videoId': video_id})['data']
        title = data['title']

        # Either URL field may hold an HLS playlist; collect formats from
        # whichever are present, tolerating download failures.
        formats = []
        for url_key in ('signedPlaylistUrl', 'streamingUrl'):
            playlist_url = data.get(url_key)
            if not playlist_url:
                continue
            formats.extend(self._extract_m3u8_formats(
                playlist_url, video_id, 'mp4', 'm3u8_native',
                m3u8_id='hls', fatal=False))

        poster = data.get('poster') or {}
        poster_id = poster.get('id')
        thumbnail = None
        if poster_id:
            # The image extension falls back to "jpg" when none is given.
            thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/%s.%s' % (poster_id, poster.get('format') or 'jpg')

        # publishTimeMillis goes through int_or_none with 1000 as the second
        # argument -- presumably scaling milliseconds to seconds.
        publish_time = try_get(
            data, lambda x: x['postInfo']['publishTimeMillis'])

        return {
            'id': video_id,
            'title': title,
            'description': strip_or_none(data.get('description')),
            'formats': formats,
            'tags': data.get('tags'),
            'timestamp': int_or_none(publish_time, 1000),
            'thumbnail': thumbnail,
            'uploader': data.get('network'),
        }

    def _extract_mcp_video(self, video_id):
        """Build the info dict for embeds resolved via the Univision APIs.

        Reached for every type that is neither in ``_PROVIDER_MAP`` nor
        ``kinjavideo``.
        """
        video_data = self._download_json(
            'https://api.vmh.univision.com/metadata/v1/content/' + video_id,
            video_id)['videoMetadata']
        iptc = video_data['photoVideoMetadataIPTC']
        title = iptc['title']['en']
        fmg = video_data.get('photoVideoMetadata_fmg') or {}

        # The metadata may name its own token host; otherwise use the default.
        tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com'
        signed = self._download_json(
            tvss_domain + '/api/v3/video-auth/url-signature-tokens',
            video_id, query={'mcpids': video_id})['data'][0]

        rendition_url = signed.get('renditionUrl')
        if rendition_url:
            formats = self._extract_m3u8_formats(
                rendition_url, video_id, 'mp4',
                'm3u8_native', m3u8_id='hls', fatal=False)
        else:
            formats = []

        fallback_url = signed.get('fallbackRenditionUrl')
        if fallback_url:
            # Read the bitrate from a "_<digits>.mp4" suffix in the URL,
            # when there is one.
            bitrate = self._search_regex(
                r'_(\d+)\.mp4', fallback_url,
                'bitrate', default=None)
            formats.append({
                'format_id': 'fallback',
                'tbr': int_or_none(bitrate),
                'url': fallback_url,
            })

        return {
            'id': video_id,
            'title': title,
            'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str),
            'uploader': fmg.get('network'),
            'duration': int_or_none(iptc.get('fileDuration')),
            'formats': formats,
            'description': try_get(iptc, lambda x: x['description']['en'], compat_str),
            'timestamp': parse_iso8601(iptc.get('dateReleased')),
        }