yt_dlp/extractor/arte.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     GeoRestrictedError,
   7     int_or_none,
   8     join_nonempty,
   9     parse_iso8601,
  10     parse_qs,
  11     strip_or_none,
  12     traverse_obj,
  13     url_or_none,
  14 )
  15
  16
  17 class ArteTVBaseIE(InfoExtractor):
  18     _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
  19     _API_BASE = 'https://api.arte.tv/api/player/v2'
  20
  21
  22 class ArteTVIE(ArteTVBaseIE):
  23     _VALID_URL = r'''(?x)
  24                     (?:https?://
  25                         (?:
  26                             (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
  27                             api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
  28                         )
  29                     |arte://program)
  30                         /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
  31                     ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
  32     _TESTS = [{
  33         'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
  34         'only_matching': True,
  35     }, {
  36         'note': 'No alt_title',
  37         'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
  38         'only_matching': True,
  39     }, {
  40         'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
  41         'only_matching': True,
  42     }, {
  43         'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
  44         'only_matching': True,
  45     }, {
  46         'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
  47         'only_matching': True,
  48     }, {
  49         'url': 'https://www.arte.tv/fr/videos/109067-000-A/la-loi-de-teheran/',
  50         'info_dict': {
  51             'id': '109067-000-A',
  52             'ext': 'mp4',
  53             'description': 'md5:d2ca367b8ecee028dddaa8bd1aebc739',
  54             'timestamp': 1713927600,
  55             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/3rR6PLzfbigSkkeHtkCZNF/940x530',
  56             'duration': 7599,
  57             'title': 'La loi de Téhéran',
  58             'upload_date': '20240424',
  59             'subtitles': {
  60                 'fr': 'mincount:1',
  61                 'fr-acc': 'mincount:1',
  62                 'fr-forced': 'mincount:1',
  63             },
  64         },
  65     }, {
  66         'note': 'age-restricted',
  67         'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
  68         'info_dict': {
  69             'id': '006785-000-A',
  70             'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
  71             'title': 'The Element of Crime',
  72             'timestamp': 1696111200,
  73             'duration': 5849,
  74             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
  75             'upload_date': '20230930',
  76             'ext': 'mp4',
  77         },
  78         'skip': '404 Not Found',
  79     }]
  80
  81     _GEO_BYPASS = True
  82
  83     _LANG_MAP = {  # ISO639 -> French abbreviations
  84         'fr': 'F',
  85         'de': 'A',
  86         'en': 'E[ANG]',
  87         'es': 'E[ESP]',
  88         'it': 'E[ITA]',
  89         'pl': 'E[POL]',
  90         # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
  91         # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
  92         'mul': 'EU',
  93     }
  94
  95     _VERSION_CODE_RE = re.compile(r'''(?x)
  96         V
  97         (?P<original_voice>O?)
  98         (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
  99         (?P<audio_desc>AUD|)
 100         (?:
 101             (?P<has_sub>-ST)
 102             (?P<sdh_sub>M?)
 103             (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
 104         )?
 105     ''')
 106
 107     # all obtained by exhaustive testing
 108     _COUNTRIES_MAP = {
 109         'DE_FR': (
 110             'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
 111             'PF', 'PM', 'RE', 'WF', 'YT',
 112         ),
 113         # with both of the below 'BE' sometimes works, sometimes doesn't
 114         'EUR_DE_FR': (
 115             'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
 116             'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
 117             'YT',
 118         ),
 119         'SAT': (
 120             'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
 121             'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
 122             'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
 123             'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
 124             'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
 125             'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
 126         ),
 127     }
 128
 129     @staticmethod
 130     def _fix_accessible_subs_locale(subs):
 131         updated_subs = {}
 132         for lang, sub_formats in subs.items():
 133             for fmt in sub_formats:
 134                 url = fmt.get('url') or ''
 135                 suffix = ('acc' if url.endswith('-MAL.m3u8')
 136                           else 'forced' if '_VO' not in url
 137                           else None)
 138                 updated_subs.setdefault(join_nonempty(lang, suffix), []).append(fmt)
 139         return updated_subs
 140
 141     def _real_extract(self, url):
 142         mobj = self._match_valid_url(url)
 143         video_id = mobj.group('id')
 144         lang = mobj.group('lang') or mobj.group('lang_2')
 145         language_code = self._LANG_MAP.get(lang)
 146
 147         config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={
 148             'x-validated-age': '18'
 149         })
 150
 151         geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
 152         if geoblocking.get('restrictedArea'):
 153             raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
 154                                      countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))
 155
 156         if not traverse_obj(config, ('data', 'attributes', 'rights')):
 157             # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
 158             # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
 159             raise ExtractorError(
 160                 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)
 161
 162         formats, subtitles = [], {}
 163         secondary_formats = []
 164         for stream in config['data']['attributes']['streams']:
 165             # official player contains code like `e.get("versions")[0].eStat.ml5`
 166             stream_version = stream['versions'][0]
 167             stream_version_code = stream_version['eStat']['ml5']
 168
 169             lang_pref = -1
 170             m = self._VERSION_CODE_RE.match(stream_version_code)
 171             if m:
 172                 lang_pref = int(''.join('01'[x] for x in (
 173                     m.group('vlang') == language_code,      # we prefer voice in the requested language
 174                     not m.group('audio_desc'),              # and not the audio description version
 175                     bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
 176                     m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
 177                     not m.group('has_sub'),                 # but we prefer no subtitles otherwise
 178                     not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
 179                 )))
 180
 181             short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
 182             if 'HLS' in stream['protocol']:
 183                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
 184                     stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
 185                 for fmt in fmts:
 186                     fmt.update({
 187                         'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 188                         'language_preference': lang_pref,
 189                     })
 190                 if any(map(short_label.startswith, ('cc', 'OGsub'))):
 191                     secondary_formats.extend(fmts)
 192                 else:
 193                     formats.extend(fmts)
 194                 subs = self._fix_accessible_subs_locale(subs)
 195                 self._merge_subtitles(subs, target=subtitles)
 196
 197             elif stream['protocol'] in ('HTTPS', 'RTMP'):
 198                 formats.append({
 199                     'format_id': f'{stream["protocol"]}-{stream_version_code}',
 200                     'url': stream['url'],
 201                     'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 202                     'language_preference': lang_pref,
 203                     # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
 204                 })
 205
 206             else:
 207                 self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
 208
 209         formats.extend(secondary_formats)
 210         self._remove_duplicate_formats(formats)
 211
 212         metadata = config['data']['attributes']['metadata']
 213
 214         return {
 215             'id': metadata['providerId'],
 216             'webpage_url': traverse_obj(metadata, ('link', 'url')),
 217             'title': traverse_obj(metadata, 'subtitle', 'title'),
 218             'alt_title': metadata.get('subtitle') and metadata.get('title'),
 219             'description': metadata.get('description'),
 220             'duration': traverse_obj(metadata, ('duration', 'seconds')),
 221             'language': metadata.get('language'),
 222             'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
 223             'is_live': config['data']['attributes'].get('live', False),
 224             'formats': formats,
 225             'subtitles': subtitles,
 226             'thumbnails': [
 227                 {'url': image['url'], 'id': image.get('caption')}
 228                 for image in metadata.get('images') or [] if url_or_none(image.get('url'))
 229             ],
 230             # TODO: chapters may also be in stream['segments']?
 231             'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
 232                 'start_time': 'startTime',
 233                 'title': 'title',
 234             })) or None,
 235         }
 236
 237
 238 class ArteTVEmbedIE(InfoExtractor):
 239     _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
 240     _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
 241     _TESTS = [{
 242         'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
 243         'info_dict': {
 244             'id': '100605-013-A',
 245             'ext': 'mp4',
 246             'title': 'United we Stream November Lockdown Edition #13',
 247             'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
 248             'upload_date': '20201116',
 249         },
 250         'skip': 'No video available'
 251     }, {
 252         'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
 253         'only_matching': True,
 254     }]
 255
 256     def _real_extract(self, url):
 257         qs = parse_qs(url)
 258         json_url = qs['json_url'][0]
 259         video_id = ArteTVIE._match_id(json_url)
 260         return self.url_result(
 261             json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
 262
 263
 264 class ArteTVPlaylistIE(ArteTVBaseIE):
 265     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
 266     _TESTS = [{
 267         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
 268         'only_matching': True,
 269     }, {
 270         'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
 271         'playlist_mincount': 100,
 272         'info_dict': {
 273             'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
 274             'id': 'RC-014123',
 275             'title': 'ARTE Reportage - najlepsze reportaże',
 276         },
 277     }]
 278
 279     def _real_extract(self, url):
 280         lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
 281         playlist = self._download_json(
 282             f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']
 283
 284         entries = [{
 285             '_type': 'url_transparent',
 286             'url': video['config']['url'],
 287             'ie_key': ArteTVIE.ie_key(),
 288             'id': video.get('providerId'),
 289             'title': video.get('title'),
 290             'alt_title': video.get('subtitle'),
 291             'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
 292             'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
 293         } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]
 294
 295         return self.playlist_result(entries, playlist_id,
 296                                     traverse_obj(playlist, ('metadata', 'title')),
 297                                     traverse_obj(playlist, ('metadata', 'description')))
 298
 299
 300 class ArteTVCategoryIE(ArteTVBaseIE):
 301     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
 302     _TESTS = [{
 303         'url': 'https://www.arte.tv/en/videos/politics-and-society/',
 304         'info_dict': {
 305             'id': 'politics-and-society',
 306             'title': 'Politics and society',
 307             'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
 308         },
 309         'playlist_mincount': 13,
 310     }]
 311
 312     @classmethod
 313     def suitable(cls, url):
 314         return (
 315             not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
 316             and super().suitable(url))
 317
 318     def _real_extract(self, url):
 319         lang, playlist_id = self._match_valid_url(url).groups()
 320         webpage = self._download_webpage(url, playlist_id)
 321
 322         items = []
 323         for video in re.finditer(
 324                 r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
 325                 webpage):
 326             video = video.group('url')
 327             if video == url:
 328                 continue
 329             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
 330                 items.append(video)
 331
 332         title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None
 333
 334         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
 335                                           description=self._og_search_description(webpage, default=None))