yt_dlp/extractor/arte.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     GeoRestrictedError,
   7     int_or_none,
   8     parse_iso8601,
   9     parse_qs,
  10     strip_or_none,
  11     traverse_obj,
  12     url_or_none,
  13 )
  14
  15
  16 class ArteTVBaseIE(InfoExtractor):
  17     _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
  18     _API_BASE = 'https://api.arte.tv/api/player/v2'
  19
  20
  21 class ArteTVIE(ArteTVBaseIE):
  22     _VALID_URL = r'''(?x)
  23                     (?:https?://
  24                         (?:
  25                             (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
  26                             api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
  27                         )
  28                     |arte://program)
  29                         /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
  30                     ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
  31     _TESTS = [{
  32         'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
  33         'only_matching': True,
  34     }, {
  35         'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
  36         'info_dict': {
  37             'id': '100103-000-A',
  38             'title': 'USA: Dyskryminacja na porodówce',
  39             'description': 'md5:242017b7cce59ffae340a54baefcafb1',
  40             'alt_title': 'ARTE Reportage',
  41             'upload_date': '20201103',
  42             'duration': 554,
  43             'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
  44             'timestamp': 1604417980,
  45             'ext': 'mp4',
  46         },
  47         'params': {'skip_download': 'm3u8'}
  48     }, {
  49         'note': 'No alt_title',
  50         'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
  51         'info_dict': {
  52             'id': '110371-000-A',
  53             'ext': 'mp4',
  54             'upload_date': '20220718',
  55             'duration': 154,
  56             'timestamp': 1658162460,
  57             'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
  58             'title': 'La chaleur, supplice des arbres de rue',
  59             'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
  60         },
  61         'params': {'skip_download': 'm3u8'}
  62     }, {
  63         'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
  64         'only_matching': True,
  65     }, {
  66         'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
  67         'only_matching': True,
  68     }]
  69
  70     _GEO_BYPASS = True
  71
  72     _LANG_MAP = {  # ISO639 -> French abbreviations
  73         'fr': 'F',
  74         'de': 'A',
  75         'en': 'E[ANG]',
  76         'es': 'E[ESP]',
  77         'it': 'E[ITA]',
  78         'pl': 'E[POL]',
  79         # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
  80         # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
  81         'mul': 'EU',
  82     }
  83
  84     _VERSION_CODE_RE = re.compile(r'''(?x)
  85         V
  86         (?P<original_voice>O?)
  87         (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
  88         (?P<audio_desc>AUD|)
  89         (?:
  90             (?P<has_sub>-ST)
  91             (?P<sdh_sub>M?)
  92             (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
  93         )?
  94     ''')
  95
  96     # all obtained by exhaustive testing
  97     _COUNTRIES_MAP = {
  98         'DE_FR': {
  99             'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
 100             'PF', 'PM', 'RE', 'WF', 'YT',
 101         },
 102         # with both of the below 'BE' sometimes works, sometimes doesn't
 103         'EUR_DE_FR': {
 104             'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
 105             'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
 106             'YT',
 107         },
 108         'SAT': {
 109             'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
 110             'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
 111             'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
 112             'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
 113             'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
 114             'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
 115         },
 116     }
 117
 118     def _real_extract(self, url):
 119         mobj = self._match_valid_url(url)
 120         video_id = mobj.group('id')
 121         lang = mobj.group('lang') or mobj.group('lang_2')
 122         langauge_code = self._LANG_MAP.get(lang)
 123
 124         config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)
 125
 126         geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
 127         if geoblocking.get('restrictedArea'):
 128             raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
 129                                      countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))
 130
 131         if not traverse_obj(config, ('data', 'attributes', 'rights')):
 132             # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
 133             # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
 134             raise ExtractorError(
 135                 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)
 136
 137         formats, subtitles = [], {}
 138         for stream in config['data']['attributes']['streams']:
 139             # official player contains code like `e.get("versions")[0].eStat.ml5`
 140             stream_version = stream['versions'][0]
 141             stream_version_code = stream_version['eStat']['ml5']
 142
 143             lang_pref = -1
 144             m = self._VERSION_CODE_RE.match(stream_version_code)
 145             if m:
 146                 lang_pref = int(''.join('01'[x] for x in (
 147                     m.group('vlang') == langauge_code,      # we prefer voice in the requested language
 148                     not m.group('audio_desc'),              # and not the audio description version
 149                     bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
 150                     m.group('sub_lang') == langauge_code,   # if subtitles are present, we prefer them in the requested language
 151                     not m.group('has_sub'),                 # but we prefer no subtitles otherwise
 152                     not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
 153                 )))
 154
 155             if stream['protocol'].startswith('HLS'):
 156                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
 157                     stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
 158                 for fmt in fmts:
 159                     fmt.update({
 160                         'format_note': f'{stream_version.get("label", "unknown")} [{stream_version.get("shortLabel", "?")}]',
 161                         'language_preference': lang_pref,
 162                     })
 163                 formats.extend(fmts)
 164                 self._merge_subtitles(subs, target=subtitles)
 165
 166             elif stream['protocol'] in ('HTTPS', 'RTMP'):
 167                 formats.append({
 168                     'format_id': f'{stream["protocol"]}-{stream_version_code}',
 169                     'url': stream['url'],
 170                     'format_note': f'{stream_version.get("label", "unknown")} [{stream_version.get("shortLabel", "?")}]',
 171                     'language_preference': lang_pref,
 172                     # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
 173                 })
 174
 175             else:
 176                 self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
 177
 178             # TODO: chapters from stream['segments']?
 179             # The JS also looks for chapters in config['data']['attributes']['chapters'],
 180             # but I am yet to find a video having those
 181
 182         self._sort_formats(formats)
 183
 184         metadata = config['data']['attributes']['metadata']
 185
 186         return {
 187             'id': metadata['providerId'],
 188             'webpage_url': traverse_obj(metadata, ('link', 'url')),
 189             'title': traverse_obj(metadata, 'subtitle', 'title'),
 190             'alt_title': metadata.get('subtitle') and metadata.get('title'),
 191             'description': metadata.get('description'),
 192             'duration': traverse_obj(metadata, ('duration', 'seconds')),
 193             'language': metadata.get('language'),
 194             'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
 195             'is_live': config['data']['attributes'].get('live', False),
 196             'formats': formats,
 197             'subtitles': subtitles,
 198             'thumbnails': [
 199                 {'url': image['url'], 'id': image.get('caption')}
 200                 for image in metadata.get('images') or [] if url_or_none(image.get('url'))
 201             ],
 202         }
 203
 204
 205 class ArteTVEmbedIE(InfoExtractor):
 206     _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
 207     _TESTS = [{
 208         'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
 209         'info_dict': {
 210             'id': '100605-013-A',
 211             'ext': 'mp4',
 212             'title': 'United we Stream November Lockdown Edition #13',
 213             'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
 214             'upload_date': '20201116',
 215         },
 216         'skip': 'No video available'
 217     }, {
 218         'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
 219         'only_matching': True,
 220     }]
 221
 222     @staticmethod
 223     def _extract_urls(webpage):
 224         return [url for _, url in re.findall(
 225             r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
 226             webpage)]
 227
 228     def _real_extract(self, url):
 229         qs = parse_qs(url)
 230         json_url = qs['json_url'][0]
 231         video_id = ArteTVIE._match_id(json_url)
 232         return self.url_result(
 233             json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
 234
 235
 236 class ArteTVPlaylistIE(ArteTVBaseIE):
 237     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
 238     _TESTS = [{
 239         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
 240         'only_matching': True,
 241     }, {
 242         'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
 243         'playlist_mincount': 100,
 244         'info_dict': {
 245             'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
 246             'id': 'RC-014123',
 247             'title': 'ARTE Reportage - najlepsze reportaże',
 248         },
 249     }]
 250
 251     def _real_extract(self, url):
 252         lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
 253         playlist = self._download_json(
 254             f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']
 255
 256         entries = [{
 257             '_type': 'url_transparent',
 258             'url': video['config']['url'],
 259             'ie_key': ArteTVIE.ie_key(),
 260             'id': video.get('providerId'),
 261             'title': video.get('title'),
 262             'alt_title': video.get('subtitle'),
 263             'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
 264             'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
 265         } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]
 266
 267         return self.playlist_result(entries, playlist_id,
 268                                     traverse_obj(playlist, ('metadata', 'title')),
 269                                     traverse_obj(playlist, ('metadata', 'description')))
 270
 271
 272 class ArteTVCategoryIE(ArteTVBaseIE):
 273     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
 274     _TESTS = [{
 275         'url': 'https://www.arte.tv/en/videos/politics-and-society/',
 276         'info_dict': {
 277             'id': 'politics-and-society',
 278             'title': 'Politics and society',
 279             'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
 280         },
 281         'playlist_mincount': 13,
 282     }]
 283
 284     @classmethod
 285     def suitable(cls, url):
 286         return (
 287             not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
 288             and super().suitable(url))
 289
 290     def _real_extract(self, url):
 291         lang, playlist_id = self._match_valid_url(url).groups()
 292         webpage = self._download_webpage(url, playlist_id)
 293
 294         items = []
 295         for video in re.finditer(
 296                 r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
 297                 webpage):
 298             video = video.group('url')
 299             if video == url:
 300                 continue
 301             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
 302                 items.append(video)
 303
 304         title = (self._og_search_title(webpage, default=None)
 305                  or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', default=None))
 306         title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url)
 307
 308         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
 309                                           description=self._og_search_description(webpage, default=None))