yt_dlp/extractor/arte.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     GeoRestrictedError,
   7     int_or_none,
   8     parse_iso8601,
   9     parse_qs,
  10     strip_or_none,
  11     traverse_obj,
  12     url_or_none,
  13 )
  14
  15
  16 class ArteTVBaseIE(InfoExtractor):
  17     _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
  18     _API_BASE = 'https://api.arte.tv/api/player/v2'
  19
  20
  21 class ArteTVIE(ArteTVBaseIE):
  22     _VALID_URL = r'''(?x)
  23                     (?:https?://
  24                         (?:
  25                             (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
  26                             api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
  27                         )
  28                     |arte://program)
  29                         /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
  30                     ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
  31     _TESTS = [{
  32         'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
  33         'only_matching': True,
  34     }, {
  35         'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
  36         'info_dict': {
  37             'id': '100103-000-A',
  38             'title': 'USA: Dyskryminacja na porodówce',
  39             'description': 'md5:242017b7cce59ffae340a54baefcafb1',
  40             'alt_title': 'ARTE Reportage',
  41             'upload_date': '20201103',
  42             'duration': 554,
  43             'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
  44             'timestamp': 1604417980,
  45             'ext': 'mp4',
  46         },
  47         'params': {'skip_download': 'm3u8'}
  48     }, {
  49         'note': 'No alt_title',
  50         'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
  51         'info_dict': {
  52             'id': '110371-000-A',
  53             'ext': 'mp4',
  54             'upload_date': '20220718',
  55             'duration': 154,
  56             'timestamp': 1658162460,
  57             'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
  58             'title': 'La chaleur, supplice des arbres de rue',
  59             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
  60         },
  61         'params': {'skip_download': 'm3u8'}
  62     }, {
  63         'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
  64         'only_matching': True,
  65     }, {
  66         'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
  67         'only_matching': True,
  68     }]
  69
  70     _GEO_BYPASS = True
  71
  72     _LANG_MAP = {  # ISO639 -> French abbreviations
  73         'fr': 'F',
  74         'de': 'A',
  75         'en': 'E[ANG]',
  76         'es': 'E[ESP]',
  77         'it': 'E[ITA]',
  78         'pl': 'E[POL]',
  79         # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
  80         # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
  81         'mul': 'EU',
  82     }
  83
  84     _VERSION_CODE_RE = re.compile(r'''(?x)
  85         V
  86         (?P<original_voice>O?)
  87         (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
  88         (?P<audio_desc>AUD|)
  89         (?:
  90             (?P<has_sub>-ST)
  91             (?P<sdh_sub>M?)
  92             (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
  93         )?
  94     ''')
  95
  96     # all obtained by exhaustive testing
  97     _COUNTRIES_MAP = {
  98         'DE_FR': (
  99             'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
 100             'PF', 'PM', 'RE', 'WF', 'YT',
 101         ),
 102         # with both of the below 'BE' sometimes works, sometimes doesn't
 103         'EUR_DE_FR': (
 104             'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
 105             'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
 106             'YT',
 107         ),
 108         'SAT': (
 109             'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
 110             'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
 111             'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
 112             'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
 113             'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
 114             'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
 115         ),
 116     }
 117
 118     def _real_extract(self, url):
 119         mobj = self._match_valid_url(url)
 120         video_id = mobj.group('id')
 121         lang = mobj.group('lang') or mobj.group('lang_2')
 122         langauge_code = self._LANG_MAP.get(lang)
 123
 124         config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)
 125
 126         geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
 127         if geoblocking.get('restrictedArea'):
 128             raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
 129                                      countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))
 130
 131         if not traverse_obj(config, ('data', 'attributes', 'rights')):
 132             # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
 133             # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
 134             raise ExtractorError(
 135                 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)
 136
 137         formats, subtitles = [], {}
 138         secondary_formats = []
 139         for stream in config['data']['attributes']['streams']:
 140             # official player contains code like `e.get("versions")[0].eStat.ml5`
 141             stream_version = stream['versions'][0]
 142             stream_version_code = stream_version['eStat']['ml5']
 143
 144             lang_pref = -1
 145             m = self._VERSION_CODE_RE.match(stream_version_code)
 146             if m:
 147                 lang_pref = int(''.join('01'[x] for x in (
 148                     m.group('vlang') == langauge_code,      # we prefer voice in the requested language
 149                     not m.group('audio_desc'),              # and not the audio description version
 150                     bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
 151                     m.group('sub_lang') == langauge_code,   # if subtitles are present, we prefer them in the requested language
 152                     not m.group('has_sub'),                 # but we prefer no subtitles otherwise
 153                     not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
 154                 )))
 155
 156             short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
 157             if stream['protocol'].startswith('HLS'):
 158                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
 159                     stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
 160                 for fmt in fmts:
 161                     fmt.update({
 162                         'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 163                         'language_preference': lang_pref,
 164                     })
 165                 if any(map(short_label.startswith, ('cc', 'OGsub'))):
 166                     secondary_formats.extend(fmts)
 167                 else:
 168                     formats.extend(fmts)
 169                 self._merge_subtitles(subs, target=subtitles)
 170
 171             elif stream['protocol'] in ('HTTPS', 'RTMP'):
 172                 formats.append({
 173                     'format_id': f'{stream["protocol"]}-{stream_version_code}',
 174                     'url': stream['url'],
 175                     'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
 176                     'language_preference': lang_pref,
 177                     # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
 178                 })
 179
 180             else:
 181                 self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
 182
 183             # TODO: chapters from stream['segments']?
 184             # The JS also looks for chapters in config['data']['attributes']['chapters'],
 185             # but I am yet to find a video having those
 186
 187         formats.extend(secondary_formats)
 188         self._remove_duplicate_formats(formats)
 189
 190         metadata = config['data']['attributes']['metadata']
 191
 192         return {
 193             'id': metadata['providerId'],
 194             'webpage_url': traverse_obj(metadata, ('link', 'url')),
 195             'title': traverse_obj(metadata, 'subtitle', 'title'),
 196             'alt_title': metadata.get('subtitle') and metadata.get('title'),
 197             'description': metadata.get('description'),
 198             'duration': traverse_obj(metadata, ('duration', 'seconds')),
 199             'language': metadata.get('language'),
 200             'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
 201             'is_live': config['data']['attributes'].get('live', False),
 202             'formats': formats,
 203             'subtitles': subtitles,
 204             'thumbnails': [
 205                 {'url': image['url'], 'id': image.get('caption')}
 206                 for image in metadata.get('images') or [] if url_or_none(image.get('url'))
 207             ],
 208         }
 209
 210
 211 class ArteTVEmbedIE(InfoExtractor):
 212     _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
 213     _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
 214     _TESTS = [{
 215         'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
 216         'info_dict': {
 217             'id': '100605-013-A',
 218             'ext': 'mp4',
 219             'title': 'United we Stream November Lockdown Edition #13',
 220             'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
 221             'upload_date': '20201116',
 222         },
 223         'skip': 'No video available'
 224     }, {
 225         'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
 226         'only_matching': True,
 227     }]
 228
 229     def _real_extract(self, url):
 230         qs = parse_qs(url)
 231         json_url = qs['json_url'][0]
 232         video_id = ArteTVIE._match_id(json_url)
 233         return self.url_result(
 234             json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
 235
 236
 237 class ArteTVPlaylistIE(ArteTVBaseIE):
 238     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
 239     _TESTS = [{
 240         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
 241         'only_matching': True,
 242     }, {
 243         'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
 244         'playlist_mincount': 100,
 245         'info_dict': {
 246             'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
 247             'id': 'RC-014123',
 248             'title': 'ARTE Reportage - najlepsze reportaże',
 249         },
 250     }]
 251
 252     def _real_extract(self, url):
 253         lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
 254         playlist = self._download_json(
 255             f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']
 256
 257         entries = [{
 258             '_type': 'url_transparent',
 259             'url': video['config']['url'],
 260             'ie_key': ArteTVIE.ie_key(),
 261             'id': video.get('providerId'),
 262             'title': video.get('title'),
 263             'alt_title': video.get('subtitle'),
 264             'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
 265             'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
 266         } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]
 267
 268         return self.playlist_result(entries, playlist_id,
 269                                     traverse_obj(playlist, ('metadata', 'title')),
 270                                     traverse_obj(playlist, ('metadata', 'description')))
 271
 272
 273 class ArteTVCategoryIE(ArteTVBaseIE):
 274     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
 275     _TESTS = [{
 276         'url': 'https://www.arte.tv/en/videos/politics-and-society/',
 277         'info_dict': {
 278             'id': 'politics-and-society',
 279             'title': 'Politics and society',
 280             'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
 281         },
 282         'playlist_mincount': 13,
 283     }]
 284
 285     @classmethod
 286     def suitable(cls, url):
 287         return (
 288             not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
 289             and super().suitable(url))
 290
 291     def _real_extract(self, url):
 292         lang, playlist_id = self._match_valid_url(url).groups()
 293         webpage = self._download_webpage(url, playlist_id)
 294
 295         items = []
 296         for video in re.finditer(
 297                 r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
 298                 webpage):
 299             video = video.group('url')
 300             if video == url:
 301                 continue
 302             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
 303                 items.append(video)
 304
 305         title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None
 306
 307         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
 308                                           description=self._og_search_description(webpage, default=None))