yt_dlp/extractor/arte.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     GeoRestrictedError,
   7     int_or_none,
   8     parse_iso8601,
   9     parse_qs,
  10     strip_or_none,
  11     traverse_obj,
  12     url_or_none,
  13 )
  14
  15
  16 class ArteTVBaseIE(InfoExtractor):
  17     _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
  18     _API_BASE = 'https://api.arte.tv/api/player/v2'
  19
  20
  21 class ArteTVIE(ArteTVBaseIE):
  22     _VALID_URL = r'''(?x)
  23                     (?:https?://
  24                         (?:
  25                             (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
  26                             api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
  27                         )
  28                     |arte://program)
  29                         /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
  30                     ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
  31     _TESTS = [{
  32         'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
  33         'only_matching': True,
  34     }, {
  35         'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
  36         'info_dict': {
  37             'id': '100103-000-A',
  38             'title': 'USA: Dyskryminacja na porodówce',
  39             'description': 'md5:242017b7cce59ffae340a54baefcafb1',
  40             'alt_title': 'ARTE Reportage',
  41             'upload_date': '20201103',
  42             'duration': 554,
  43             'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
  44             'timestamp': 1604417980,
  45             'ext': 'mp4',
  46         },
  47         'params': {'skip_download': 'm3u8'}
  48     }, {
  49         'note': 'No alt_title',
  50         'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
  51         'only_matching': True,
  52     }, {
  53         'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
  54         'only_matching': True,
  55     }, {
  56         'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
  57         'only_matching': True,
  58     }, {
  59         'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
  60         'only_matching': True,
  61     }, {
  62         'note': 'age-restricted',
  63         'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
  64         'info_dict': {
  65             'id': '006785-000-A',
  66             'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
  67             'title': 'The Element of Crime',
  68             'timestamp': 1696111200,
  69             'duration': 5849,
  70             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
  71             'upload_date': '20230930',
  72             'ext': 'mp4',
  73         }
  74     }]
  75
  76     _GEO_BYPASS = True
  77
  78     _LANG_MAP = {  # ISO639 -> French abbreviations
  79         'fr': 'F',
  80         'de': 'A',
  81         'en': 'E[ANG]',
  82         'es': 'E[ESP]',
  83         'it': 'E[ITA]',
  84         'pl': 'E[POL]',
  85         # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
  86         # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
  87         'mul': 'EU',
  88     }
  89
  90     _VERSION_CODE_RE = re.compile(r'''(?x)
  91         V
  92         (?P<original_voice>O?)
  93         (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
  94         (?P<audio_desc>AUD|)
  95         (?:
  96             (?P<has_sub>-ST)
  97             (?P<sdh_sub>M?)
  98             (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
  99         )?
 100     ''')
 101
 102     # all obtained by exhaustive testing
 103     _COUNTRIES_MAP = {
 104         'DE_FR': (
 105             'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
 106             'PF', 'PM', 'RE', 'WF', 'YT',
 107         ),
 108         # with both of the below 'BE' sometimes works, sometimes doesn't
 109         'EUR_DE_FR': (
 110             'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
 111             'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
 112             'YT',
 113         ),
 114         'SAT': (
 115             'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
 116             'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
 117             'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
 118             'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
 119             'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
 120             'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
 121         ),
 122     }
 123
 124     def _real_extract(self, url):
 125         mobj = self._match_valid_url(url)
 126         video_id = mobj.group('id')
 127         lang = mobj.group('lang') or mobj.group('lang_2')
 128         langauge_code = self._LANG_MAP.get(lang)
 129
 130         config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={
 131             'x-validated-age': '18'
 132         })
 133
 134         geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
 135         if geoblocking.get('restrictedArea'):
 136             raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
 137                                      countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))
 138
 139         if not traverse_obj(config, ('data', 'attributes', 'rights')):
 140             # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
 141             # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
 142             raise ExtractorError(
 143                 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)
 144
 145         formats, subtitles = [], {}
 146         secondary_formats = []
 147         for stream in config['data']['attributes']['streams']:
 148             # official player contains code like `e.get("versions")[0].eStat.ml5`
 149             stream_version = stream['versions'][0]
 150             stream_version_code = stream_version['eStat']['ml5']
 151
 152             lang_pref = -1
 153             m = self._VERSION_CODE_RE.match(stream_version_code)
 154             if m:
 155                 lang_pref = int(''.join('01'[x] for x in (
 156                     m.group('vlang') == langauge_code,      # we prefer voice in the requested language
 157                     not m.group('audio_desc'),              # and not the audio description version
 158                     bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
 159                     m.group('sub_lang') == langauge_code,   # if subtitles are present, we prefer them in the requested language
 160                     not m.group('has_sub'),                 # but we prefer no subtitles otherwise
 161                     not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
 162                 )))
 163
 164             short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
 165             if 'HLS' in stream['protocol']:
 166                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
 167                     stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
 168                 for fmt in fmts:
 169                     fmt.update({
 170                         'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 171                         'language_preference': lang_pref,
 172                     })
 173                 if any(map(short_label.startswith, ('cc', 'OGsub'))):
 174                     secondary_formats.extend(fmts)
 175                 else:
 176                     formats.extend(fmts)
 177                 self._merge_subtitles(subs, target=subtitles)
 178
 179             elif stream['protocol'] in ('HTTPS', 'RTMP'):
 180                 formats.append({
 181                     'format_id': f'{stream["protocol"]}-{stream_version_code}',
 182                     'url': stream['url'],
 183                     'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 184                     'language_preference': lang_pref,
 185                     # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
 186                 })
 187
 188             else:
 189                 self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
 190
 191         formats.extend(secondary_formats)
 192         self._remove_duplicate_formats(formats)
 193
 194         metadata = config['data']['attributes']['metadata']
 195
 196         return {
 197             'id': metadata['providerId'],
 198             'webpage_url': traverse_obj(metadata, ('link', 'url')),
 199             'title': traverse_obj(metadata, 'subtitle', 'title'),
 200             'alt_title': metadata.get('subtitle') and metadata.get('title'),
 201             'description': metadata.get('description'),
 202             'duration': traverse_obj(metadata, ('duration', 'seconds')),
 203             'language': metadata.get('language'),
 204             'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
 205             'is_live': config['data']['attributes'].get('live', False),
 206             'formats': formats,
 207             'subtitles': subtitles,
 208             'thumbnails': [
 209                 {'url': image['url'], 'id': image.get('caption')}
 210                 for image in metadata.get('images') or [] if url_or_none(image.get('url'))
 211             ],
 212             # TODO: chapters may also be in stream['segments']?
 213             'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
 214                 'start_time': 'startTime',
 215                 'title': 'title',
 216             })) or None,
 217         }
 218
 219
 220 class ArteTVEmbedIE(InfoExtractor):
 221     _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
 222     _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
 223     _TESTS = [{
 224         'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
 225         'info_dict': {
 226             'id': '100605-013-A',
 227             'ext': 'mp4',
 228             'title': 'United we Stream November Lockdown Edition #13',
 229             'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
 230             'upload_date': '20201116',
 231         },
 232         'skip': 'No video available'
 233     }, {
 234         'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
 235         'only_matching': True,
 236     }]
 237
 238     def _real_extract(self, url):
 239         qs = parse_qs(url)
 240         json_url = qs['json_url'][0]
 241         video_id = ArteTVIE._match_id(json_url)
 242         return self.url_result(
 243             json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
 244
 245
 246 class ArteTVPlaylistIE(ArteTVBaseIE):
 247     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
 248     _TESTS = [{
 249         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
 250         'only_matching': True,
 251     }, {
 252         'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
 253         'playlist_mincount': 100,
 254         'info_dict': {
 255             'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
 256             'id': 'RC-014123',
 257             'title': 'ARTE Reportage - najlepsze reportaże',
 258         },
 259     }]
 260
 261     def _real_extract(self, url):
 262         lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
 263         playlist = self._download_json(
 264             f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']
 265
 266         entries = [{
 267             '_type': 'url_transparent',
 268             'url': video['config']['url'],
 269             'ie_key': ArteTVIE.ie_key(),
 270             'id': video.get('providerId'),
 271             'title': video.get('title'),
 272             'alt_title': video.get('subtitle'),
 273             'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
 274             'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
 275         } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]
 276
 277         return self.playlist_result(entries, playlist_id,
 278                                     traverse_obj(playlist, ('metadata', 'title')),
 279                                     traverse_obj(playlist, ('metadata', 'description')))
 280
 281
 282 class ArteTVCategoryIE(ArteTVBaseIE):
 283     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
 284     _TESTS = [{
 285         'url': 'https://www.arte.tv/en/videos/politics-and-society/',
 286         'info_dict': {
 287             'id': 'politics-and-society',
 288             'title': 'Politics and society',
 289             'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
 290         },
 291         'playlist_mincount': 13,
 292     }]
 293
 294     @classmethod
 295     def suitable(cls, url):
 296         return (
 297             not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
 298             and super().suitable(url))
 299
 300     def _real_extract(self, url):
 301         lang, playlist_id = self._match_valid_url(url).groups()
 302         webpage = self._download_webpage(url, playlist_id)
 303
 304         items = []
 305         for video in re.finditer(
 306                 r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
 307                 webpage):
 308             video = video.group('url')
 309             if video == url:
 310                 continue
 311             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
 312                 items.append(video)
 313
 314         title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None
 315
 316         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
 317                                           description=self._og_search_description(webpage, default=None))