yt_dlp/extractor/arte.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     GeoRestrictedError,
   7     int_or_none,
   8     parse_iso8601,
   9     parse_qs,
  10     strip_or_none,
  11     traverse_obj,
  12     url_or_none,
  13 )
  14
  15
  16 class ArteTVBaseIE(InfoExtractor):
  17     _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
  18     _API_BASE = 'https://api.arte.tv/api/player/v2'
  19
  20
  21 class ArteTVIE(ArteTVBaseIE):
  22     _VALID_URL = r'''(?x)
  23                     (?:https?://
  24                         (?:
  25                             (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
  26                             api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
  27                         )
  28                     |arte://program)
  29                         /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
  30                     ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
  31     _TESTS = [{
  32         'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
  33         'only_matching': True,
  34     }, {
  35         'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
  36         'info_dict': {
  37             'id': '100103-000-A',
  38             'title': 'USA: Dyskryminacja na porodówce',
  39             'description': 'md5:242017b7cce59ffae340a54baefcafb1',
  40             'alt_title': 'ARTE Reportage',
  41             'upload_date': '20201103',
  42             'duration': 554,
  43             'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
  44             'timestamp': 1604417980,
  45             'ext': 'mp4',
  46         },
  47         'params': {'skip_download': 'm3u8'}
  48     }, {
  49         'note': 'No alt_title',
  50         'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
  51         'only_matching': True,
  52     }, {
  53         'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
  54         'only_matching': True,
  55     }, {
  56         'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
  57         'only_matching': True,
  58     }, {
  59         'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
  60         'only_matching': True,
  61     }, {
  62         'note': 'age-restricted',
  63         'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
  64         'info_dict': {
  65             'id': '006785-000-A',
  66             'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
  67             'title': 'The Element of Crime',
  68             'timestamp': 1696111200,
  69             'duration': 5849,
  70             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
  71             'upload_date': '20230930',
  72             'ext': 'mp4',
  73         },
  74     }, {
  75         'url': 'https://www.arte.tv/de/videos/085374-003-A/im-hohen-norden-geboren/',
  76         'info_dict': {
  77             'id': '085374-003-A',
  78             'ext': 'mp4',
  79             'description': 'md5:ab79ec7cc472a93164415b4e4916abf9',
  80             'timestamp': 1702872000,
  81             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/TnyHBfPxv3v2GEY3suXGZP/940x530',
  82             'duration': 2594,
  83             'title': 'Die kurze Zeit der Jugend',
  84             'alt_title': 'Im hohen Norden geboren',
  85             'upload_date': '20231218',
  86             'subtitles': {
  87                 'fr': 'mincount:1',
  88                 'fr-acc': 'mincount:1',
  89             },
  90         },
  91     }]
  92
  93     _GEO_BYPASS = True
  94
  95     _LANG_MAP = {  # ISO639 -> French abbreviations
  96         'fr': 'F',
  97         'de': 'A',
  98         'en': 'E[ANG]',
  99         'es': 'E[ESP]',
 100         'it': 'E[ITA]',
 101         'pl': 'E[POL]',
 102         # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
 103         # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
 104         'mul': 'EU',
 105     }
 106
 107     _VERSION_CODE_RE = re.compile(r'''(?x)
 108         V
 109         (?P<original_voice>O?)
 110         (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
 111         (?P<audio_desc>AUD|)
 112         (?:
 113             (?P<has_sub>-ST)
 114             (?P<sdh_sub>M?)
 115             (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
 116         )?
 117     ''')
 118
 119     # all obtained by exhaustive testing
 120     _COUNTRIES_MAP = {
 121         'DE_FR': (
 122             'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
 123             'PF', 'PM', 'RE', 'WF', 'YT',
 124         ),
 125         # with both of the below 'BE' sometimes works, sometimes doesn't
 126         'EUR_DE_FR': (
 127             'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
 128             'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
 129             'YT',
 130         ),
 131         'SAT': (
 132             'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
 133             'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
 134             'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
 135             'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
 136             'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
 137             'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
 138         ),
 139     }
 140
 141     @staticmethod
 142     def _fix_accessible_subs_locale(subs):
 143         updated_subs = {}
 144         for lang, sub_formats in subs.items():
 145             for fmt in sub_formats:
 146                 if fmt.get('url', '').endswith('-MAL.m3u8'):
 147                     lang += '-acc'
 148                 updated_subs.setdefault(lang, []).append(fmt)
 149         return updated_subs
 150
 151     def _real_extract(self, url):
 152         mobj = self._match_valid_url(url)
 153         video_id = mobj.group('id')
 154         lang = mobj.group('lang') or mobj.group('lang_2')
 155         langauge_code = self._LANG_MAP.get(lang)
 156
 157         config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={
 158             'x-validated-age': '18'
 159         })
 160
 161         geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
 162         if geoblocking.get('restrictedArea'):
 163             raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
 164                                      countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))
 165
 166         if not traverse_obj(config, ('data', 'attributes', 'rights')):
 167             # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
 168             # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
 169             raise ExtractorError(
 170                 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)
 171
 172         formats, subtitles = [], {}
 173         secondary_formats = []
 174         for stream in config['data']['attributes']['streams']:
 175             # official player contains code like `e.get("versions")[0].eStat.ml5`
 176             stream_version = stream['versions'][0]
 177             stream_version_code = stream_version['eStat']['ml5']
 178
 179             lang_pref = -1
 180             m = self._VERSION_CODE_RE.match(stream_version_code)
 181             if m:
 182                 lang_pref = int(''.join('01'[x] for x in (
 183                     m.group('vlang') == langauge_code,      # we prefer voice in the requested language
 184                     not m.group('audio_desc'),              # and not the audio description version
 185                     bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
 186                     m.group('sub_lang') == langauge_code,   # if subtitles are present, we prefer them in the requested language
 187                     not m.group('has_sub'),                 # but we prefer no subtitles otherwise
 188                     not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
 189                 )))
 190
 191             short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
 192             if 'HLS' in stream['protocol']:
 193                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
 194                     stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
 195                 for fmt in fmts:
 196                     fmt.update({
 197                         'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 198                         'language_preference': lang_pref,
 199                     })
 200                 if any(map(short_label.startswith, ('cc', 'OGsub'))):
 201                     secondary_formats.extend(fmts)
 202                 else:
 203                     formats.extend(fmts)
 204                 subs = self._fix_accessible_subs_locale(subs)
 205                 self._merge_subtitles(subs, target=subtitles)
 206
 207             elif stream['protocol'] in ('HTTPS', 'RTMP'):
 208                 formats.append({
 209                     'format_id': f'{stream["protocol"]}-{stream_version_code}',
 210                     'url': stream['url'],
 211                     'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 212                     'language_preference': lang_pref,
 213                     # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
 214                 })
 215
 216             else:
 217                 self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
 218
 219         formats.extend(secondary_formats)
 220         self._remove_duplicate_formats(formats)
 221
 222         metadata = config['data']['attributes']['metadata']
 223
 224         return {
 225             'id': metadata['providerId'],
 226             'webpage_url': traverse_obj(metadata, ('link', 'url')),
 227             'title': traverse_obj(metadata, 'subtitle', 'title'),
 228             'alt_title': metadata.get('subtitle') and metadata.get('title'),
 229             'description': metadata.get('description'),
 230             'duration': traverse_obj(metadata, ('duration', 'seconds')),
 231             'language': metadata.get('language'),
 232             'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
 233             'is_live': config['data']['attributes'].get('live', False),
 234             'formats': formats,
 235             'subtitles': subtitles,
 236             'thumbnails': [
 237                 {'url': image['url'], 'id': image.get('caption')}
 238                 for image in metadata.get('images') or [] if url_or_none(image.get('url'))
 239             ],
 240             # TODO: chapters may also be in stream['segments']?
 241             'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
 242                 'start_time': 'startTime',
 243                 'title': 'title',
 244             })) or None,
 245         }
 246
 247
 248 class ArteTVEmbedIE(InfoExtractor):
 249     _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
 250     _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
 251     _TESTS = [{
 252         'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
 253         'info_dict': {
 254             'id': '100605-013-A',
 255             'ext': 'mp4',
 256             'title': 'United we Stream November Lockdown Edition #13',
 257             'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
 258             'upload_date': '20201116',
 259         },
 260         'skip': 'No video available'
 261     }, {
 262         'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
 263         'only_matching': True,
 264     }]
 265
 266     def _real_extract(self, url):
 267         qs = parse_qs(url)
 268         json_url = qs['json_url'][0]
 269         video_id = ArteTVIE._match_id(json_url)
 270         return self.url_result(
 271             json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
 272
 273
 274 class ArteTVPlaylistIE(ArteTVBaseIE):
 275     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
 276     _TESTS = [{
 277         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
 278         'only_matching': True,
 279     }, {
 280         'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
 281         'playlist_mincount': 100,
 282         'info_dict': {
 283             'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
 284             'id': 'RC-014123',
 285             'title': 'ARTE Reportage - najlepsze reportaże',
 286         },
 287     }]
 288
 289     def _real_extract(self, url):
 290         lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
 291         playlist = self._download_json(
 292             f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']
 293
 294         entries = [{
 295             '_type': 'url_transparent',
 296             'url': video['config']['url'],
 297             'ie_key': ArteTVIE.ie_key(),
 298             'id': video.get('providerId'),
 299             'title': video.get('title'),
 300             'alt_title': video.get('subtitle'),
 301             'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
 302             'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
 303         } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]
 304
 305         return self.playlist_result(entries, playlist_id,
 306                                     traverse_obj(playlist, ('metadata', 'title')),
 307                                     traverse_obj(playlist, ('metadata', 'description')))
 308
 309
 310 class ArteTVCategoryIE(ArteTVBaseIE):
 311     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
 312     _TESTS = [{
 313         'url': 'https://www.arte.tv/en/videos/politics-and-society/',
 314         'info_dict': {
 315             'id': 'politics-and-society',
 316             'title': 'Politics and society',
 317             'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
 318         },
 319         'playlist_mincount': 13,
 320     }]
 321
 322     @classmethod
 323     def suitable(cls, url):
 324         return (
 325             not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
 326             and super().suitable(url))
 327
 328     def _real_extract(self, url):
 329         lang, playlist_id = self._match_valid_url(url).groups()
 330         webpage = self._download_webpage(url, playlist_id)
 331
 332         items = []
 333         for video in re.finditer(
 334                 r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
 335                 webpage):
 336             video = video.group('url')
 337             if video == url:
 338                 continue
 339             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
 340                 items.append(video)
 341
 342         title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None
 343
 344         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
 345                                           description=self._og_search_description(webpage, default=None))