yt_dlp/extractor/arte.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     GeoRestrictedError,
   7     int_or_none,
   8     parse_iso8601,
   9     parse_qs,
  10     strip_or_none,
  11     traverse_obj,
  12     url_or_none,
  13 )
  14
  15
  16 class ArteTVBaseIE(InfoExtractor):
  17     _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
  18     _API_BASE = 'https://api.arte.tv/api/player/v2'
  19
  20
  21 class ArteTVIE(ArteTVBaseIE):
  22     _VALID_URL = r'''(?x)
  23                     (?:https?://
  24                         (?:
  25                             (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
  26                             api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
  27                         )
  28                     |arte://program)
  29                         /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
  30                     ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
  31     _TESTS = [{
  32         'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
  33         'only_matching': True,
  34     }, {
  35         'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
  36         'info_dict': {
  37             'id': '100103-000-A',
  38             'title': 'USA: Dyskryminacja na porodówce',
  39             'description': 'md5:242017b7cce59ffae340a54baefcafb1',
  40             'alt_title': 'ARTE Reportage',
  41             'upload_date': '20201103',
  42             'duration': 554,
  43             'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
  44             'timestamp': 1604417980,
  45             'ext': 'mp4',
  46         },
  47         'params': {'skip_download': 'm3u8'}
  48     }, {
  49         'note': 'No alt_title',
  50         'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
  51         'info_dict': {
  52             'id': '110371-000-A',
  53             'ext': 'mp4',
  54             'upload_date': '20220718',
  55             'duration': 154,
  56             'timestamp': 1658162460,
  57             'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
  58             'title': 'La chaleur, supplice des arbres de rue',
  59             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
  60         },
  61         'params': {'skip_download': 'm3u8'}
  62     }, {
  63         'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
  64         'only_matching': True,
  65     }, {
  66         'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
  67         'only_matching': True,
  68     }]
  69
  70     _GEO_BYPASS = True
  71
  72     _LANG_MAP = {  # ISO639 -> French abbreviations
  73         'fr': 'F',
  74         'de': 'A',
  75         'en': 'E[ANG]',
  76         'es': 'E[ESP]',
  77         'it': 'E[ITA]',
  78         'pl': 'E[POL]',
  79         # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
  80         # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
  81         'mul': 'EU',
  82     }
  83
  84     _VERSION_CODE_RE = re.compile(r'''(?x)
  85         V
  86         (?P<original_voice>O?)
  87         (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
  88         (?P<audio_desc>AUD|)
  89         (?:
  90             (?P<has_sub>-ST)
  91             (?P<sdh_sub>M?)
  92             (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
  93         )?
  94     ''')
  95
  96     # all obtained by exhaustive testing
  97     _COUNTRIES_MAP = {
  98         'DE_FR': (
  99             'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
 100             'PF', 'PM', 'RE', 'WF', 'YT',
 101         ),
 102         # with both of the below 'BE' sometimes works, sometimes doesn't
 103         'EUR_DE_FR': (
 104             'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
 105             'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
 106             'YT',
 107         ),
 108         'SAT': (
 109             'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
 110             'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
 111             'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
 112             'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
 113             'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
 114             'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
 115         ),
 116     }
 117
 118     def _real_extract(self, url):
 119         mobj = self._match_valid_url(url)
 120         video_id = mobj.group('id')
 121         lang = mobj.group('lang') or mobj.group('lang_2')
 122         langauge_code = self._LANG_MAP.get(lang)
 123
 124         config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)
 125
 126         geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
 127         if geoblocking.get('restrictedArea'):
 128             raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
 129                                      countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))
 130
 131         if not traverse_obj(config, ('data', 'attributes', 'rights')):
 132             # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
 133             # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
 134             raise ExtractorError(
 135                 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)
 136
 137         formats, subtitles = [], {}
 138         secondary_formats = []
 139         for stream in config['data']['attributes']['streams']:
 140             # official player contains code like `e.get("versions")[0].eStat.ml5`
 141             stream_version = stream['versions'][0]
 142             stream_version_code = stream_version['eStat']['ml5']
 143
 144             lang_pref = -1
 145             m = self._VERSION_CODE_RE.match(stream_version_code)
 146             if m:
 147                 lang_pref = int(''.join('01'[x] for x in (
 148                     m.group('vlang') == langauge_code,      # we prefer voice in the requested language
 149                     not m.group('audio_desc'),              # and not the audio description version
 150                     bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
 151                     m.group('sub_lang') == langauge_code,   # if subtitles are present, we prefer them in the requested language
 152                     not m.group('has_sub'),                 # but we prefer no subtitles otherwise
 153                     not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
 154                 )))
 155
 156             short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
 157             if stream['protocol'].startswith('HLS'):
 158                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
 159                     stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
 160                 for fmt in fmts:
 161                     fmt.update({
 162                         'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 163                         'language_preference': lang_pref,
 164                     })
 165                 if any(map(short_label.startswith, ('cc', 'OGsub'))):
 166                     secondary_formats.extend(fmts)
 167                 else:
 168                     formats.extend(fmts)
 169                 self._merge_subtitles(subs, target=subtitles)
 170
 171             elif stream['protocol'] in ('HTTPS', 'RTMP'):
 172                 formats.append({
 173                     'format_id': f'{stream["protocol"]}-{stream_version_code}',
 174                     'url': stream['url'],
 175                     'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 176                     'language_preference': lang_pref,
 177                     # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
 178                 })
 179
 180             else:
 181                 self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
 182
 183             # TODO: chapters from stream['segments']?
 184             # The JS also looks for chapters in config['data']['attributes']['chapters'],
 185             # but I am yet to find a video having those
 186
 187         formats.extend(secondary_formats)
 188         self._remove_duplicate_formats(formats)
 189         self._sort_formats(formats)
 190
 191         metadata = config['data']['attributes']['metadata']
 192
 193         return {
 194             'id': metadata['providerId'],
 195             'webpage_url': traverse_obj(metadata, ('link', 'url')),
 196             'title': traverse_obj(metadata, 'subtitle', 'title'),
 197             'alt_title': metadata.get('subtitle') and metadata.get('title'),
 198             'description': metadata.get('description'),
 199             'duration': traverse_obj(metadata, ('duration', 'seconds')),
 200             'language': metadata.get('language'),
 201             'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
 202             'is_live': config['data']['attributes'].get('live', False),
 203             'formats': formats,
 204             'subtitles': subtitles,
 205             'thumbnails': [
 206                 {'url': image['url'], 'id': image.get('caption')}
 207                 for image in metadata.get('images') or [] if url_or_none(image.get('url'))
 208             ],
 209         }
 210
 211
 212 class ArteTVEmbedIE(InfoExtractor):
 213     _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
 214     _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
 215     _TESTS = [{
 216         'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
 217         'info_dict': {
 218             'id': '100605-013-A',
 219             'ext': 'mp4',
 220             'title': 'United we Stream November Lockdown Edition #13',
 221             'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
 222             'upload_date': '20201116',
 223         },
 224         'skip': 'No video available'
 225     }, {
 226         'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
 227         'only_matching': True,
 228     }]
 229
 230     def _real_extract(self, url):
 231         qs = parse_qs(url)
 232         json_url = qs['json_url'][0]
 233         video_id = ArteTVIE._match_id(json_url)
 234         return self.url_result(
 235             json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
 236
 237
 238 class ArteTVPlaylistIE(ArteTVBaseIE):
 239     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
 240     _TESTS = [{
 241         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
 242         'only_matching': True,
 243     }, {
 244         'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
 245         'playlist_mincount': 100,
 246         'info_dict': {
 247             'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
 248             'id': 'RC-014123',
 249             'title': 'ARTE Reportage - najlepsze reportaże',
 250         },
 251     }]
 252
 253     def _real_extract(self, url):
 254         lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
 255         playlist = self._download_json(
 256             f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']
 257
 258         entries = [{
 259             '_type': 'url_transparent',
 260             'url': video['config']['url'],
 261             'ie_key': ArteTVIE.ie_key(),
 262             'id': video.get('providerId'),
 263             'title': video.get('title'),
 264             'alt_title': video.get('subtitle'),
 265             'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
 266             'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
 267         } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]
 268
 269         return self.playlist_result(entries, playlist_id,
 270                                     traverse_obj(playlist, ('metadata', 'title')),
 271                                     traverse_obj(playlist, ('metadata', 'description')))
 272
 273
 274 class ArteTVCategoryIE(ArteTVBaseIE):
 275     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
 276     _TESTS = [{
 277         'url': 'https://www.arte.tv/en/videos/politics-and-society/',
 278         'info_dict': {
 279             'id': 'politics-and-society',
 280             'title': 'Politics and society',
 281             'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
 282         },
 283         'playlist_mincount': 13,
 284     }]
 285
 286     @classmethod
 287     def suitable(cls, url):
 288         return (
 289             not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
 290             and super().suitable(url))
 291
 292     def _real_extract(self, url):
 293         lang, playlist_id = self._match_valid_url(url).groups()
 294         webpage = self._download_webpage(url, playlist_id)
 295
 296         items = []
 297         for video in re.finditer(
 298                 r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
 299                 webpage):
 300             video = video.group('url')
 301             if video == url:
 302                 continue
 303             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
 304                 items.append(video)
 305
 306         title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None
 307
 308         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
 309                                           description=self._og_search_description(webpage, default=None))