yt_dlp/extractor/arte.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     GeoRestrictedError,
   7     int_or_none,
   8     parse_iso8601,
   9     parse_qs,
  10     strip_or_none,
  11     traverse_obj,
  12     url_or_none,
  13 )
  14
  15
  16 class ArteTVBaseIE(InfoExtractor):
  17     _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
  18     _API_BASE = 'https://api.arte.tv/api/player/v2'
  19
  20
  21 class ArteTVIE(ArteTVBaseIE):
  22     _VALID_URL = r'''(?x)
  23                     (?:https?://
  24                         (?:
  25                             (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
  26                             api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
  27                         )
  28                     |arte://program)
  29                         /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
  30                     ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
  31     _TESTS = [{
  32         'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
  33         'only_matching': True,
  34     }, {
  35         'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
  36         'info_dict': {
  37             'id': '100103-000-A',
  38             'title': 'USA: Dyskryminacja na porodówce',
  39             'description': 'md5:242017b7cce59ffae340a54baefcafb1',
  40             'alt_title': 'ARTE Reportage',
  41             'upload_date': '20201103',
  42             'duration': 554,
  43             'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
  44             'timestamp': 1604417980,
  45             'ext': 'mp4',
  46         },
  47         'params': {'skip_download': 'm3u8'}
  48     }, {
  49         'note': 'No alt_title',
  50         'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
  51         'info_dict': {
  52             'id': '110371-000-A',
  53             'ext': 'mp4',
  54             'upload_date': '20220718',
  55             'duration': 154,
  56             'timestamp': 1658162460,
  57             'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
  58             'title': 'La chaleur, supplice des arbres de rue',
  59             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
  60         },
  61         'params': {'skip_download': 'm3u8'}
  62     }, {
  63         'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
  64         'only_matching': True,
  65     }, {
  66         'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
  67         'only_matching': True,
  68     }, {
  69         'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
  70         'info_dict': {
  71             'id': '110203-006-A',
  72             'chapters': 'count:16',
  73             'description': 'md5:cf592f1df52fe52007e3f8eac813c084',
  74             'alt_title': 'Zaz',
  75             'title': 'Baloise Session 2022',
  76             'timestamp': 1668445200,
  77             'duration': 4054,
  78             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530',
  79             'upload_date': '20221114',
  80             'ext': 'mp4',
  81         },
  82         'expected_warnings': ['geo restricted']
  83     }]
  84
  85     _GEO_BYPASS = True
  86
  87     _LANG_MAP = {  # ISO639 -> French abbreviations
  88         'fr': 'F',
  89         'de': 'A',
  90         'en': 'E[ANG]',
  91         'es': 'E[ESP]',
  92         'it': 'E[ITA]',
  93         'pl': 'E[POL]',
  94         # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
  95         # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
  96         'mul': 'EU',
  97     }
  98
  99     _VERSION_CODE_RE = re.compile(r'''(?x)
 100         V
 101         (?P<original_voice>O?)
 102         (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
 103         (?P<audio_desc>AUD|)
 104         (?:
 105             (?P<has_sub>-ST)
 106             (?P<sdh_sub>M?)
 107             (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
 108         )?
 109     ''')
 110
 111     # all obtained by exhaustive testing
 112     _COUNTRIES_MAP = {
 113         'DE_FR': (
 114             'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
 115             'PF', 'PM', 'RE', 'WF', 'YT',
 116         ),
 117         # with both of the below 'BE' sometimes works, sometimes doesn't
 118         'EUR_DE_FR': (
 119             'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
 120             'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
 121             'YT',
 122         ),
 123         'SAT': (
 124             'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
 125             'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
 126             'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
 127             'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
 128             'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
 129             'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
 130         ),
 131     }
 132
 133     def _real_extract(self, url):
 134         mobj = self._match_valid_url(url)
 135         video_id = mobj.group('id')
 136         lang = mobj.group('lang') or mobj.group('lang_2')
 137         langauge_code = self._LANG_MAP.get(lang)
 138
 139         config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)
 140
 141         geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
 142         if geoblocking.get('restrictedArea'):
 143             raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
 144                                      countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))
 145
 146         if not traverse_obj(config, ('data', 'attributes', 'rights')):
 147             # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
 148             # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
 149             raise ExtractorError(
 150                 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)
 151
 152         formats, subtitles = [], {}
 153         secondary_formats = []
 154         for stream in config['data']['attributes']['streams']:
 155             # official player contains code like `e.get("versions")[0].eStat.ml5`
 156             stream_version = stream['versions'][0]
 157             stream_version_code = stream_version['eStat']['ml5']
 158
 159             lang_pref = -1
 160             m = self._VERSION_CODE_RE.match(stream_version_code)
 161             if m:
 162                 lang_pref = int(''.join('01'[x] for x in (
 163                     m.group('vlang') == langauge_code,      # we prefer voice in the requested language
 164                     not m.group('audio_desc'),              # and not the audio description version
 165                     bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
 166                     m.group('sub_lang') == langauge_code,   # if subtitles are present, we prefer them in the requested language
 167                     not m.group('has_sub'),                 # but we prefer no subtitles otherwise
 168                     not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
 169                 )))
 170
 171             short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
 172             if stream['protocol'].startswith('HLS'):
 173                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
 174                     stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
 175                 for fmt in fmts:
 176                     fmt.update({
 177                         'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 178                         'language_preference': lang_pref,
 179                     })
 180                 if any(map(short_label.startswith, ('cc', 'OGsub'))):
 181                     secondary_formats.extend(fmts)
 182                 else:
 183                     formats.extend(fmts)
 184                 self._merge_subtitles(subs, target=subtitles)
 185
 186             elif stream['protocol'] in ('HTTPS', 'RTMP'):
 187                 formats.append({
 188                     'format_id': f'{stream["protocol"]}-{stream_version_code}',
 189                     'url': stream['url'],
 190                     'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 191                     'language_preference': lang_pref,
 192                     # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
 193                 })
 194
 195             else:
 196                 self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
 197
 198         formats.extend(secondary_formats)
 199         self._remove_duplicate_formats(formats)
 200
 201         metadata = config['data']['attributes']['metadata']
 202
 203         return {
 204             'id': metadata['providerId'],
 205             'webpage_url': traverse_obj(metadata, ('link', 'url')),
 206             'title': traverse_obj(metadata, 'subtitle', 'title'),
 207             'alt_title': metadata.get('subtitle') and metadata.get('title'),
 208             'description': metadata.get('description'),
 209             'duration': traverse_obj(metadata, ('duration', 'seconds')),
 210             'language': metadata.get('language'),
 211             'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
 212             'is_live': config['data']['attributes'].get('live', False),
 213             'formats': formats,
 214             'subtitles': subtitles,
 215             'thumbnails': [
 216                 {'url': image['url'], 'id': image.get('caption')}
 217                 for image in metadata.get('images') or [] if url_or_none(image.get('url'))
 218             ],
 219             # TODO: chapters may also be in stream['segments']?
 220             'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
 221                 'start_time': 'startTime',
 222                 'title': 'title',
 223             })) or None,
 224         }
 225
 226
 227 class ArteTVEmbedIE(InfoExtractor):
 228     _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
 229     _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
 230     _TESTS = [{
 231         'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
 232         'info_dict': {
 233             'id': '100605-013-A',
 234             'ext': 'mp4',
 235             'title': 'United we Stream November Lockdown Edition #13',
 236             'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
 237             'upload_date': '20201116',
 238         },
 239         'skip': 'No video available'
 240     }, {
 241         'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
 242         'only_matching': True,
 243     }]
 244
 245     def _real_extract(self, url):
 246         qs = parse_qs(url)
 247         json_url = qs['json_url'][0]
 248         video_id = ArteTVIE._match_id(json_url)
 249         return self.url_result(
 250             json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
 251
 252
 253 class ArteTVPlaylistIE(ArteTVBaseIE):
 254     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
 255     _TESTS = [{
 256         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
 257         'only_matching': True,
 258     }, {
 259         'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
 260         'playlist_mincount': 100,
 261         'info_dict': {
 262             'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
 263             'id': 'RC-014123',
 264             'title': 'ARTE Reportage - najlepsze reportaże',
 265         },
 266     }]
 267
 268     def _real_extract(self, url):
 269         lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
 270         playlist = self._download_json(
 271             f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']
 272
 273         entries = [{
 274             '_type': 'url_transparent',
 275             'url': video['config']['url'],
 276             'ie_key': ArteTVIE.ie_key(),
 277             'id': video.get('providerId'),
 278             'title': video.get('title'),
 279             'alt_title': video.get('subtitle'),
 280             'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
 281             'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
 282         } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]
 283
 284         return self.playlist_result(entries, playlist_id,
 285                                     traverse_obj(playlist, ('metadata', 'title')),
 286                                     traverse_obj(playlist, ('metadata', 'description')))
 287
 288
 289 class ArteTVCategoryIE(ArteTVBaseIE):
 290     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
 291     _TESTS = [{
 292         'url': 'https://www.arte.tv/en/videos/politics-and-society/',
 293         'info_dict': {
 294             'id': 'politics-and-society',
 295             'title': 'Politics and society',
 296             'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
 297         },
 298         'playlist_mincount': 13,
 299     }]
 300
 301     @classmethod
 302     def suitable(cls, url):
 303         return (
 304             not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
 305             and super().suitable(url))
 306
 307     def _real_extract(self, url):
 308         lang, playlist_id = self._match_valid_url(url).groups()
 309         webpage = self._download_webpage(url, playlist_id)
 310
 311         items = []
 312         for video in re.finditer(
 313                 r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
 314                 webpage):
 315             video = video.group('url')
 316             if video == url:
 317                 continue
 318             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
 319                 items.append(video)
 320
 321         title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None
 322
 323         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
 324                                           description=self._og_search_description(webpage, default=None))