yt_dlp/extractor/francetv.py

   1 import urllib.parse
   2
   3 from .common import InfoExtractor
   4 from .dailymotion import DailymotionIE
   5 from ..networking import HEADRequest
   6 from ..utils import (
   7     ExtractorError,
   8     determine_ext,
   9     filter_dict,
  10     format_field,
  11     int_or_none,
  12     join_nonempty,
  13     parse_iso8601,
  14     parse_qs,
  15     smuggle_url,
  16     unsmuggle_url,
  17     url_or_none,
  18 )
  19 from ..utils.traversal import traverse_obj
  20
  21
  22 class FranceTVBaseInfoExtractor(InfoExtractor):
  23     def _make_url_result(self, video_or_full_id, catalog=None, url=None):
  24         full_id = 'francetv:%s' % video_or_full_id
  25         if '@' not in video_or_full_id and catalog:
  26             full_id += '@%s' % catalog
  27         if url:
  28             full_id = smuggle_url(full_id, {'hostname': urllib.parse.urlparse(url).hostname})
  29         return self.url_result(
  30             full_id, ie=FranceTVIE.ie_key(),
  31             video_id=video_or_full_id.split('@')[0])
  32
  33
  34 class FranceTVIE(InfoExtractor):
  35     _VALID_URL = r'''(?x)
  36                     (?:
  37                         https?://
  38                             sivideo\.webservices\.francetelevisions\.fr/tools/getInfosOeuvre/v2/\?
  39                             .*?\bidDiffusion=[^&]+|
  40                         (?:
  41                             https?://videos\.francetv\.fr/video/|
  42                             francetv:
  43                         )
  44                         (?P<id>[^@]+)(?:@(?P<catalog>.+))?
  45                     )
  46                     '''
  47     _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1']
  48     _GEO_COUNTRIES = ['FR']
  49     _GEO_BYPASS = False
  50
  51     _TESTS = [{
  52         # without catalog
  53         'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0',
  54         'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f',
  55         'info_dict': {
  56             'id': '162311093',
  57             'ext': 'mp4',
  58             'title': '13h15, le dimanche... - Les mystères de Jésus',
  59             'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
  60             'timestamp': 1502623500,
  61             'upload_date': '20170813',
  62         },
  63     }, {
  64         # with catalog
  65         'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4',
  66         'only_matching': True,
  67     }, {
  68         'url': 'http://videos.francetv.fr/video/NI_657393@Regions',
  69         'only_matching': True,
  70     }, {
  71         'url': 'francetv:162311093',
  72         'only_matching': True,
  73     }, {
  74         'url': 'francetv:NI_1004933@Zouzous',
  75         'only_matching': True,
  76     }, {
  77         'url': 'francetv:NI_983319@Info-web',
  78         'only_matching': True,
  79     }, {
  80         'url': 'francetv:NI_983319',
  81         'only_matching': True,
  82     }, {
  83         'url': 'francetv:NI_657393@Regions',
  84         'only_matching': True,
  85     }, {
  86         # france-3 live
  87         'url': 'francetv:SIM_France3',
  88         'only_matching': True,
  89     }]
  90
  91     def _extract_video(self, video_id, catalogue=None, hostname=None):
  92         # TODO: Investigate/remove 'catalogue'/'catalog'; it has not been used since 2021
  93         is_live = None
  94         videos = []
  95         title = None
  96         subtitle = None
  97         episode_number = None
  98         season_number = None
  99         image = None
 100         duration = None
 101         timestamp = None
 102         spritesheets = None
 103
 104         for device_type in ('desktop', 'mobile'):
 105             dinfo = self._download_json(
 106                 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
 107                 video_id, f'Downloading {device_type} video JSON', query=filter_dict({
 108                     'device_type': device_type,
 109                     'browser': 'chrome',
 110                     'domain': hostname,
 111                 }), fatal=False)
 112
 113             if not dinfo:
 114                 continue
 115
 116             video = traverse_obj(dinfo, ('video', {dict}))
 117             if video:
 118                 videos.append(video)
 119                 if duration is None:
 120                     duration = video.get('duration')
 121                 if is_live is None:
 122                     is_live = video.get('is_live')
 123                 if spritesheets is None:
 124                     spritesheets = video.get('spritesheets')
 125
 126             meta = traverse_obj(dinfo, ('meta', {dict}))
 127             if meta:
 128                 if title is None:
 129                     title = meta.get('title')
 130                 # meta['pre_title'] contains season and episode number for series in format "S<ID> E<ID>"
 131                 season_number, episode_number = self._search_regex(
 132                     r'S(\d+)\s*E(\d+)', meta.get('pre_title'), 'episode info', group=(1, 2), default=(None, None))
 133                 if subtitle is None:
 134                     subtitle = meta.get('additional_title')
 135                 if image is None:
 136                     image = meta.get('image_url')
 137                 if timestamp is None:
 138                     timestamp = parse_iso8601(meta.get('broadcasted_at'))
 139
 140         formats, subtitles, video_url = [], {}, None
 141         for video in traverse_obj(videos, lambda _, v: url_or_none(v['url'])):
 142             video_url = video['url']
 143             format_id = video.get('format')
 144
 145             token_url = url_or_none(video.get('token'))
 146             if token_url and video.get('workflow') == 'token-akamai':
 147                 tokenized_url = traverse_obj(self._download_json(
 148                     token_url, video_id, f'Downloading signed {format_id} manifest URL',
 149                     fatal=False, query={
 150                         'format': 'json',
 151                         'url': video_url,
 152                     }), ('url', {url_or_none}))
 153                 if tokenized_url:
 154                     video_url = tokenized_url
 155
 156             ext = determine_ext(video_url)
 157             if ext == 'f4m':
 158                 formats.extend(self._extract_f4m_formats(
 159                     video_url, video_id, f4m_id=format_id, fatal=False))
 160             elif ext == 'm3u8':
 161                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
 162                     video_url, video_id, 'mp4',
 163                     entry_protocol='m3u8_native', m3u8_id=format_id,
 164                     fatal=False)
 165                 formats.extend(fmts)
 166                 self._merge_subtitles(subs, target=subtitles)
 167             elif ext == 'mpd':
 168                 fmts, subs = self._extract_mpd_formats_and_subtitles(
 169                     video_url, video_id, mpd_id=format_id, fatal=False)
 170                 formats.extend(fmts)
 171                 self._merge_subtitles(subs, target=subtitles)
 172             elif video_url.startswith('rtmp'):
 173                 formats.append({
 174                     'url': video_url,
 175                     'format_id': 'rtmp-%s' % format_id,
 176                     'ext': 'flv',
 177                 })
 178             else:
 179                 if self._is_valid_url(video_url, video_id, format_id):
 180                     formats.append({
 181                         'url': video_url,
 182                         'format_id': format_id,
 183                     })
 184
 185             # XXX: what is video['captions']?
 186
 187         if not formats and video_url:
 188             urlh = self._request_webpage(
 189                 HEADRequest(video_url), video_id, 'Checking for geo-restriction',
 190                 fatal=False, expected_status=403)
 191             if urlh and urlh.headers.get('x-errortype') == 'geo':
 192                 self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
 193
 194         for f in formats:
 195             if f.get('acodec') != 'none' and f.get('language') in ('qtz', 'qad'):
 196                 f['language_preference'] = -10
 197                 f['format_note'] = 'audio description%s' % format_field(f, 'format_note', ', %s')
 198
 199         if spritesheets:
 200             formats.append({
 201                 'format_id': 'spritesheets',
 202                 'format_note': 'storyboard',
 203                 'acodec': 'none',
 204                 'vcodec': 'none',
 205                 'ext': 'mhtml',
 206                 'protocol': 'mhtml',
 207                 'url': 'about:invalid',
 208                 'fragments': [{
 209                     'url': sheet,
 210                     # XXX: not entirely accurate; each spritesheet seems to be
 211                     # a 10×10 grid of thumbnails corresponding to approximately
 212                     # 2 seconds of the video; the last spritesheet may be shorter
 213                     'duration': 200,
 214                 } for sheet in spritesheets]
 215             })
 216
 217         return {
 218             'id': video_id,
 219             'title': join_nonempty(title, subtitle, delim=' - ').strip(),
 220             'thumbnail': image,
 221             'duration': duration,
 222             'timestamp': timestamp,
 223             'is_live': is_live,
 224             'formats': formats,
 225             'subtitles': subtitles,
 226             'episode': subtitle if episode_number else None,
 227             'series': title if episode_number else None,
 228             'episode_number': int_or_none(episode_number),
 229             'season_number': int_or_none(season_number),
 230         }
 231
 232     def _real_extract(self, url):
 233         url, smuggled_data = unsmuggle_url(url, {})
 234         mobj = self._match_valid_url(url)
 235         video_id = mobj.group('id')
 236         catalog = mobj.group('catalog')
 237
 238         if not video_id:
 239             qs = parse_qs(url)
 240             video_id = qs.get('idDiffusion', [None])[0]
 241             catalog = qs.get('catalogue', [None])[0]
 242             if not video_id:
 243                 raise ExtractorError('Invalid URL', expected=True)
 244
 245         return self._extract_video(video_id, catalog, hostname=smuggled_data.get('hostname'))
 246
 247
 248 class FranceTVSiteIE(FranceTVBaseInfoExtractor):
 249     _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)*(?P<id>[^/]+)\.html'
 250
 251     _TESTS = [{
 252         'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
 253         'info_dict': {
 254             'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
 255             'ext': 'mp4',
 256             'title': '13h15, le dimanche... - Les mystères de Jésus',
 257             'timestamp': 1502623500,
 258             'duration': 2580,
 259             'thumbnail': r're:^https?://.*\.jpg$',
 260             'upload_date': '20170813',
 261         },
 262         'params': {
 263             'skip_download': True,
 264         },
 265         'add_ie': [FranceTVIE.ie_key()],
 266     }, {
 267         'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html',
 268         'info_dict': {
 269             'id': 'a9050959-eedd-4b4a-9b0d-de6eeaa73e44',
 270             'ext': 'mp4',
 271             'title': 'Foot2Rue - Duel au vieux port',
 272             'episode': 'Duel au vieux port',
 273             'series': 'Foot2Rue',
 274             'episode_number': 1,
 275             'season_number': 1,
 276             'timestamp': 1642761360,
 277             'upload_date': '20220121',
 278             'season': 'Season 1',
 279             'thumbnail': r're:^https?://.*\.jpg$',
 280             'duration': 1441,
 281         },
 282     }, {
 283         # france3
 284         'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
 285         'only_matching': True,
 286     }, {
 287         # france4
 288         'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html',
 289         'only_matching': True,
 290     }, {
 291         # france5
 292         'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html',
 293         'only_matching': True,
 294     }, {
 295         # franceo
 296         'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html',
 297         'only_matching': True,
 298     }, {
 299         # france2 live
 300         'url': 'https://www.france.tv/france-2/direct.html',
 301         'only_matching': True,
 302     }, {
 303         'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html',
 304         'only_matching': True,
 305     }, {
 306         'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html',
 307         'only_matching': True,
 308     }, {
 309         'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html',
 310         'only_matching': True,
 311     }, {
 312         'url': 'https://www.france.tv/142749-rouge-sang.html',
 313         'only_matching': True,
 314     }, {
 315         # france-3 live
 316         'url': 'https://www.france.tv/france-3/direct.html',
 317         'only_matching': True,
 318     }]
 319
 320     def _real_extract(self, url):
 321         display_id = self._match_id(url)
 322
 323         webpage = self._download_webpage(url, display_id)
 324
 325         catalogue = None
 326         video_id = self._search_regex(
 327             r'(?:data-main-video\s*=|videoId["\']?\s*[:=])\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
 328             webpage, 'video id', default=None, group='id')
 329
 330         if not video_id:
 331             video_id, catalogue = self._html_search_regex(
 332                 r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
 333                 webpage, 'video ID').split('@')
 334
 335         return self._make_url_result(video_id, catalogue, url=url)
 336
 337
 338 class FranceTVInfoIE(FranceTVBaseInfoExtractor):
 339     IE_NAME = 'francetvinfo.fr'
 340     _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)'
 341
 342     _TESTS = [{
 343         'url': 'https://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-jeudi-22-aout-2019_3561461.html',
 344         'info_dict': {
 345             'id': 'd12458ee-5062-48fe-bfdd-a30d6a01b793',
 346             'ext': 'mp4',
 347             'title': 'Soir 3',
 348             'upload_date': '20190822',
 349             'timestamp': 1566510900,
 350             'description': 'md5:72d167097237701d6e8452ff03b83c00',
 351             'subtitles': {
 352                 'fr': 'mincount:2',
 353             },
 354         },
 355         'params': {
 356             'skip_download': True,
 357         },
 358         'add_ie': [FranceTVIE.ie_key()],
 359     }, {
 360         'note': 'Only an image exists in initial webpage instead of the video',
 361         'url': 'https://www.francetvinfo.fr/sante/maladie/coronavirus/covid-19-en-inde-une-situation-catastrophique-a-new-dehli_4381095.html',
 362         'info_dict': {
 363             'id': '7d204c9e-a2d3-11eb-9e4c-000d3a23d482',
 364             'ext': 'mp4',
 365             'title': 'Covid-19 : une situation catastrophique à New Dehli',
 366             'thumbnail': str,
 367             'duration': 76,
 368             'timestamp': 1619028518,
 369             'upload_date': '20210421',
 370         },
 371         'params': {
 372             'skip_download': True,
 373         },
 374         'add_ie': [FranceTVIE.ie_key()],
 375     }, {
 376         'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html',
 377         'only_matching': True,
 378     }, {
 379         'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
 380         'only_matching': True,
 381     }, {
 382         'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html',
 383         'only_matching': True,
 384     }, {
 385         # Dailymotion embed
 386         'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html',
 387         'md5': 'ee7f1828f25a648addc90cb2687b1f12',
 388         'info_dict': {
 389             'id': 'x4iiko0',
 390             'ext': 'mp4',
 391             'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen',
 392             'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016',
 393             'timestamp': 1467011958,
 394             'upload_date': '20160627',
 395             'uploader': 'France Inter',
 396             'uploader_id': 'x2q2ez',
 397         },
 398         'add_ie': ['Dailymotion'],
 399     }, {
 400         'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin',
 401         'only_matching': True,
 402     }, {
 403         # "<figure id=" pattern (#28792)
 404         'url': 'https://www.francetvinfo.fr/culture/patrimoine/incendie-de-notre-dame-de-paris/notre-dame-de-paris-de-l-incendie-de-la-cathedrale-a-sa-reconstruction_4372291.html',
 405         'only_matching': True,
 406     }]
 407
 408     def _real_extract(self, url):
 409         display_id = self._match_id(url)
 410
 411         webpage = self._download_webpage(url, display_id)
 412
 413         dailymotion_urls = tuple(DailymotionIE._extract_embed_urls(url, webpage))
 414         if dailymotion_urls:
 415             return self.playlist_result([
 416                 self.url_result(dailymotion_url, DailymotionIE.ie_key())
 417                 for dailymotion_url in dailymotion_urls])
 418
 419         video_id = self._search_regex(
 420             (r'player\.load[^;]+src:\s*["\']([^"\']+)',
 421              r'id-video=([^@]+@[^"]+)',
 422              r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"',
 423              r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'),
 424             webpage, 'video id')
 425
 426         return self._make_url_result(video_id, url=url)