yt_dlp/extractor/arte.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     GeoRestrictedError,
   7     int_or_none,
   8     parse_iso8601,
   9     parse_qs,
  10     strip_or_none,
  11     traverse_obj,
  12     url_or_none,
  13 )
  14
  15
class ArteTVBaseIE(InfoExtractor):
    # Shared constants for the arte.tv extractors defined in this module.

    # Regex alternation of the language codes accepted in URLs;
    # subclasses interpolate it into their _VALID_URL patterns.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
    # Root of the player API; subclasses append `/config/...` or `/playlist/...` to it.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
  19
  20
  21 class ArteTVIE(ArteTVBaseIE):
  22     _VALID_URL = r'''(?x)
  23                     (?:https?://
  24                         (?:
  25                             (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
  26                             api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
  27                         )
  28                     |arte://program)
  29                         /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
  30                     ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
  31     _TESTS = [{
  32         'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
  33         'only_matching': True,
  34     }, {
  35         'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
  36         'info_dict': {
  37             'id': '100103-000-A',
  38             'title': 'USA: Dyskryminacja na porodówce',
  39             'description': 'md5:242017b7cce59ffae340a54baefcafb1',
  40             'alt_title': 'ARTE Reportage',
  41             'upload_date': '20201103',
  42             'duration': 554,
  43             'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
  44             'timestamp': 1604417980,
  45             'ext': 'mp4',
  46         },
  47         'params': {'skip_download': 'm3u8'}
  48     }, {
  49         'note': 'No alt_title',
  50         'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
  51         'info_dict': {
  52             'id': '110371-000-A',
  53             'ext': 'mp4',
  54             'upload_date': '20220718',
  55             'duration': 154,
  56             'timestamp': 1658162460,
  57             'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
  58             'title': 'La chaleur, supplice des arbres de rue',
  59             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
  60         },
  61         'params': {'skip_download': 'm3u8'}
  62     }, {
  63         'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
  64         'only_matching': True,
  65     }, {
  66         'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
  67         'only_matching': True,
  68     }, {
  69         'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
  70         'info_dict': {
  71             'id': '110203-006-A',
  72             'chapters': 'count:16',
  73             'description': 'md5:cf592f1df52fe52007e3f8eac813c084',
  74             'alt_title': 'Zaz',
  75             'title': 'Baloise Session 2022',
  76             'timestamp': 1668445200,
  77             'duration': 4054,
  78             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530',
  79             'upload_date': '20221114',
  80             'ext': 'mp4',
  81         },
  82         'expected_warnings': ['geo restricted']
  83     }]
  84
  85     _GEO_BYPASS = True
  86
  87     _LANG_MAP = {  # ISO639 -> French abbreviations
  88         'fr': 'F',
  89         'de': 'A',
  90         'en': 'E[ANG]',
  91         'es': 'E[ESP]',
  92         'it': 'E[ITA]',
  93         'pl': 'E[POL]',
  94         # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
  95         # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
  96         'mul': 'EU',
  97     }
  98
  99     _VERSION_CODE_RE = re.compile(r'''(?x)
 100         V
 101         (?P<original_voice>O?)
 102         (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
 103         (?P<audio_desc>AUD|)
 104         (?:
 105             (?P<has_sub>-ST)
 106             (?P<sdh_sub>M?)
 107             (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
 108         )?
 109     ''')
 110
 111     # all obtained by exhaustive testing
 112     _COUNTRIES_MAP = {
 113         'DE_FR': (
 114             'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
 115             'PF', 'PM', 'RE', 'WF', 'YT',
 116         ),
 117         # with both of the below 'BE' sometimes works, sometimes doesn't
 118         'EUR_DE_FR': (
 119             'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
 120             'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
 121             'YT',
 122         ),
 123         'SAT': (
 124             'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
 125             'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
 126             'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
 127             'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
 128             'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
 129             'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
 130         ),
 131     }
 132
 133     def _real_extract(self, url):
 134         mobj = self._match_valid_url(url)
 135         video_id = mobj.group('id')
 136         lang = mobj.group('lang') or mobj.group('lang_2')
 137         langauge_code = self._LANG_MAP.get(lang)
 138
 139         config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)
 140
 141         geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
 142         if geoblocking.get('restrictedArea'):
 143             raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
 144                                      countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))
 145
 146         if not traverse_obj(config, ('data', 'attributes', 'rights')):
 147             # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
 148             # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
 149             raise ExtractorError(
 150                 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)
 151
 152         formats, subtitles = [], {}
 153         secondary_formats = []
 154         for stream in config['data']['attributes']['streams']:
 155             # official player contains code like `e.get("versions")[0].eStat.ml5`
 156             stream_version = stream['versions'][0]
 157             stream_version_code = stream_version['eStat']['ml5']
 158
 159             lang_pref = -1
 160             m = self._VERSION_CODE_RE.match(stream_version_code)
 161             if m:
 162                 lang_pref = int(''.join('01'[x] for x in (
 163                     m.group('vlang') == langauge_code,      # we prefer voice in the requested language
 164                     not m.group('audio_desc'),              # and not the audio description version
 165                     bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
 166                     m.group('sub_lang') == langauge_code,   # if subtitles are present, we prefer them in the requested language
 167                     not m.group('has_sub'),                 # but we prefer no subtitles otherwise
 168                     not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
 169                 )))
 170
 171             short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
 172             if stream['protocol'].startswith('HLS'):
 173                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
 174                     stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
 175                 for fmt in fmts:
 176                     fmt.update({
 177                         'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 178                         'language_preference': lang_pref,
 179                     })
 180                 if any(map(short_label.startswith, ('cc', 'OGsub'))):
 181                     secondary_formats.extend(fmts)
 182                 else:
 183                     formats.extend(fmts)
 184                 self._merge_subtitles(subs, target=subtitles)
 185
 186             elif stream['protocol'] in ('HTTPS', 'RTMP'):
 187                 formats.append({
 188                     'format_id': f'{stream["protocol"]}-{stream_version_code}',
 189                     'url': stream['url'],
 190                     'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 191                     'language_preference': lang_pref,
 192                     # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
 193                 })
 194
 195             else:
 196                 self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
 197
 198
 199         formats.extend(secondary_formats)
 200         self._remove_duplicate_formats(formats)
 201
 202         metadata = config['data']['attributes']['metadata']
 203
 204         return {
 205             'id': metadata['providerId'],
 206             'webpage_url': traverse_obj(metadata, ('link', 'url')),
 207             'title': traverse_obj(metadata, 'subtitle', 'title'),
 208             'alt_title': metadata.get('subtitle') and metadata.get('title'),
 209             'description': metadata.get('description'),
 210             'duration': traverse_obj(metadata, ('duration', 'seconds')),
 211             'language': metadata.get('language'),
 212             'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
 213             'is_live': config['data']['attributes'].get('live', False),
 214             'formats': formats,
 215             'subtitles': subtitles,
 216             'thumbnails': [
 217                 {'url': image['url'], 'id': image.get('caption')}
 218                 for image in metadata.get('images') or [] if url_or_none(image.get('url'))
 219             ],
 220             # TODO: chapters may also be in stream['segments']?
 221             'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
 222                 'start_time': 'startTime',
 223                 'title': 'title',
 224             })) or None,
 225         }
 226
 227
 228 class ArteTVEmbedIE(InfoExtractor):
 229     _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
 230     _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
 231     _TESTS = [{
 232         'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
 233         'info_dict': {
 234             'id': '100605-013-A',
 235             'ext': 'mp4',
 236             'title': 'United we Stream November Lockdown Edition #13',
 237             'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
 238             'upload_date': '20201116',
 239         },
 240         'skip': 'No video available'
 241     }, {
 242         'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
 243         'only_matching': True,
 244     }]
 245
 246     def _real_extract(self, url):
 247         qs = parse_qs(url)
 248         json_url = qs['json_url'][0]
 249         video_id = ArteTVIE._match_id(json_url)
 250         return self.url_result(
 251             json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
 252
 253
 254 class ArteTVPlaylistIE(ArteTVBaseIE):
 255     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
 256     _TESTS = [{
 257         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
 258         'only_matching': True,
 259     }, {
 260         'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
 261         'playlist_mincount': 100,
 262         'info_dict': {
 263             'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
 264             'id': 'RC-014123',
 265             'title': 'ARTE Reportage - najlepsze reportaże',
 266         },
 267     }]
 268
 269     def _real_extract(self, url):
 270         lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
 271         playlist = self._download_json(
 272             f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']
 273
 274         entries = [{
 275             '_type': 'url_transparent',
 276             'url': video['config']['url'],
 277             'ie_key': ArteTVIE.ie_key(),
 278             'id': video.get('providerId'),
 279             'title': video.get('title'),
 280             'alt_title': video.get('subtitle'),
 281             'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
 282             'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
 283         } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]
 284
 285         return self.playlist_result(entries, playlist_id,
 286                                     traverse_obj(playlist, ('metadata', 'title')),
 287                                     traverse_obj(playlist, ('metadata', 'description')))
 288
 289
 290 class ArteTVCategoryIE(ArteTVBaseIE):
 291     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
 292     _TESTS = [{
 293         'url': 'https://www.arte.tv/en/videos/politics-and-society/',
 294         'info_dict': {
 295             'id': 'politics-and-society',
 296             'title': 'Politics and society',
 297             'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
 298         },
 299         'playlist_mincount': 13,
 300     }]
 301
 302     @classmethod
 303     def suitable(cls, url):
 304         return (
 305             not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
 306             and super().suitable(url))
 307
 308     def _real_extract(self, url):
 309         lang, playlist_id = self._match_valid_url(url).groups()
 310         webpage = self._download_webpage(url, playlist_id)
 311
 312         items = []
 313         for video in re.finditer(
 314                 r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
 315                 webpage):
 316             video = video.group('url')
 317             if video == url:
 318                 continue
 319             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
 320                 items.append(video)
 321
 322         title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None
 323
 324         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
 325                                           description=self._og_search_description(webpage, default=None))