]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/arte.py
[extractor/youtube] Fix continuation loop with no comments (#7148)
[yt-dlp.git] / yt_dlp / extractor / arte.py
CommitLineData
d5822b96 1import re
d5822b96
PH
2
3from .common import InfoExtractor
4from ..utils import (
c0892b2b 5 ExtractorError,
051d6b45 6 GeoRestrictedError,
d24a2b20 7 int_or_none,
051d6b45 8 parse_iso8601,
4dfbf869 9 parse_qs,
50e93e03 10 strip_or_none,
051d6b45 11 traverse_obj,
8bdd16b4 12 url_or_none,
d5822b96
PH
13)
14
d5822b96 15
class ArteTVBaseIE(InfoExtractor):
    # Regex alternation of the language codes accepted in arte.tv URL paths;
    # interpolated into the _VALID_URL patterns of the subclasses.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
    # Root of the player API; subclasses append '/config/...' or '/playlist/...'.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
8bdd16b4 19
20
class ArteTVIE(ArteTVBaseIE):
    """Extractor for a single ARTE programme (or the LIVE stream).

    Accepts arte.tv video pages, player-API config URLs and the internal
    ``arte://program/<id>`` scheme, and resolves them through the player
    config endpoint under ``_API_BASE``.
    """

    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
        'info_dict': {
            'id': '110203-006-A',
            'chapters': 'count:16',
            'description': 'md5:cf592f1df52fe52007e3f8eac813c084',
            'alt_title': 'Zaz',
            'title': 'Baloise Session 2022',
            'timestamp': 1668445200,
            'duration': 4054,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530',
            'upload_date': '20221114',
            'ext': 'mp4',
        },
        'expected_warnings': ['geo restricted'],
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses a stream's version code (read from eStat.ml5 in _real_extract) into
    # voice language, audio-description flag and optional subtitle details.
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    def _real_extract(self, url):
        """Download the player config for ``url`` and build the info dict.

        Raises GeoRestrictedError when the config flags the request as coming
        from a restricted area, and an expected ExtractorError when the config
        carries no ``rights`` entry.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # NOTE(review): for 'arte://program/...' URLs neither language group
        # participates in the match, so `lang` is None here and is interpolated
        # as-is into the config URL below - confirm whether that is intended.
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Use .get() so that a response without 'code' still surfaces as a
            # geo-restriction error rather than an unrelated KeyError.
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(
                f'Video restricted to {geo_code!r}',
                countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        attributes = config['data']['attributes']

        formats, subtitles = [], {}
        # Formats of streams labelled 'cc...'/'OGsub...' are collected separately
        # and appended after all other formats.
        secondary_formats = []
        for stream in attributes['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            # The flags below are concatenated as '0'/'1' digits and read as one
            # integer, so earlier flags outweigh later ones when compared.
            # Version codes that do not match the pattern keep -1.
            lang_pref = -1
            m = self._VERSION_CODE_RE.match(stream_version_code)
            if m:
                lang_pref = int(''.join('01'[x] for x in (
                    m.group('vlang') == language_code,      # we prefer voice in the requested language
                    not m.group('audio_desc'),              # and not the audio description version
                    bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
                    m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
                    not m.group('has_sub'),                 # but we prefer no subtitles otherwise
                    not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )))

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'
            protocol = stream['protocol']

            if protocol.startswith('HLS'):
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif protocol in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{protocol}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {protocol}')

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = attributes['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # Prefer the subtitle as title; the main title then becomes alt_title.
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(attributes, ('rights', 'begin'), expected_type=parse_iso8601),
            'is_live': attributes.get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
            # TODO: chapters may also be in stream['segments']?
            'chapters': traverse_obj(attributes, ('chapters', 'elements', ..., {
                'start_time': 'startTime',
                'title': 'title',
            })) or None,
        }
c40f5cf4 225
24114fee 226
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for ARTE embedded-player URLs; delegates to ArteTVIE."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available',
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the URL found in the ``json_url`` query parameter to ArteTVIE."""
        # Only the first json_url value is used.
        config_url = parse_qs(url)['json_url'][0]
        programme_id = ArteTVIE._match_id(config_url)
        return self.url_result(config_url, ie=ArteTVIE.ie_key(), video_id=programme_id)
4b492e35
S
251
252
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for ARTE collections, i.e. video URLs whose id looks like ``RC-<6 digits>``."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist from the player API and return its entries."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        playlist = response['data']['attributes']

        # Only items that carry a config URL are kept; each becomes a
        # url_transparent entry resolved by ArteTVIE.
        usable_items = traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))

        entries = []
        for video in usable_items:
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
50e93e03 287
288
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for ARTE category pages, built from the video links in the page HTML."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that neither ArteTVIE nor ArteTVPlaylistIE claims."""
        for specific_ie in (ArteTVIE, ArteTVPlaylistIE):
            if specific_ie.suitable(url):
                return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Download the category page and collect the video/playlist links on it."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        link_re = r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang
        candidate_urls = (match.group('url') for match in re.finditer(link_re, webpage))
        # Skip links back to this page and keep only those that one of the
        # more specific ARTE extractors accepts.
        items = [
            candidate for candidate in candidate_urls
            if candidate != url
            and any(ie.suitable(candidate) for ie in (ArteTVIE, ArteTVPlaylistIE))
        ]

        # Keep the part of the page title before the last '|'; an empty result becomes None.
        raw_title = self._generic_title('', webpage, default='')
        title = strip_or_none(raw_title.rsplit('|', 1)[0]) or None

        description = self._og_search_description(webpage, default=None)
        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title, description=description)