[ie/francetv] Improve metadata extraction (#8409)

[yt-dlp.git] / yt_dlp / extractor / francetv.py
diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py

index 41910cefb1f6b03a20bf86fc3c4259e05dd206d3..0ceecde74c7335b26dc3a0453bae3a0bdfd4ddc5 100644 (file)
--- a/yt_dlp/extractor/francetv.py
+++ b/yt_dlp/extractor/francetv.py
@@ -1,24 +1,14 @@
-# coding: utf-8
-
-from __future__ import unicode_literals
-
-
  from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-)
+from .dailymotion import DailymotionIE
  from ..utils import (
-    clean_html,
-    determine_ext,
      ExtractorError,
+    determine_ext,
+    format_field,
      int_or_none,
-    parse_duration,
+    join_nonempty,
+    parse_iso8601,
      parse_qs,
-    try_get,
-    url_or_none,
-    urljoin,
  )
-from .dailymotion import DailymotionIE
  
  
  class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -44,6 +34,7 @@ class FranceTVIE(InfoExtractor):
                          (?P<id>[^@]+)(?:@(?P<catalog>.+))?
                      )
                      '''
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1']
  
      _TESTS = [{
          # without catalog
@@ -89,97 +80,85 @@ def _extract_video(self, video_id, catalogue=None):
          # Videos are identified by idDiffusion so catalogue part is optional.
          # However when provided, some extra formats may be returned so we pass
          # it if available.
-        info = self._download_json(
-            'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/',
-            video_id, 'Downloading video JSON', query={
-                'idDiffusion': video_id,
-                'catalogue': catalogue or '',
-            })
-
-        if info.get('status') == 'NOK':
-            raise ExtractorError(
-                '%s returned error: %s' % (self.IE_NAME, info['message']),
-                expected=True)
-        allowed_countries = info['videos'][0].get('geoblocage')
-        if allowed_countries:
-            georestricted = True
-            geo_info = self._download_json(
-                'http://geo.francetv.fr/ws/edgescape.json', video_id,
-                'Downloading geo restriction info')
-            country = geo_info['reponse']['geo_info']['country_code']
-            if country not in allowed_countries:
-                raise ExtractorError(
-                    'The video is not available from your location',
-                    expected=True)
-        else:
-            georestricted = False
-
-        def sign(manifest_url, manifest_id):
-            for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'):
-                signed_url = url_or_none(self._download_webpage(
-                    'https://%s/esi/TA' % host, video_id,
-                    'Downloading signed %s manifest URL' % manifest_id,
-                    fatal=False, query={
-                        'url': manifest_url,
-                    }))
-                if signed_url:
-                    return signed_url
-            return manifest_url
-
          is_live = None
-
          videos = []
-
-        for video in (info.get('videos') or []):
-            if video.get('statut') != 'ONLINE':
-                continue
-            if not video.get('url'):
+        title = None
+        subtitle = None
+        episode_number = None
+        season_number = None
+        image = None
+        duration = None
+        timestamp = None
+        spritesheets = None
+
+        for device_type in ('desktop', 'mobile'):
+            dinfo = self._download_json(
+                'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
+                video_id, 'Downloading %s video JSON' % device_type, query={
+                    'device_type': device_type,
+                    'browser': 'chrome',
+                }, fatal=False)
+
+            if not dinfo:
                  continue
-            videos.append(video)
-
-        if not videos:
-            for device_type in ['desktop', 'mobile']:
-                fallback_info = self._download_json(
-                    'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
-                    video_id, 'Downloading fallback %s video JSON' % device_type, query={
-                        'device_type': device_type,
-                        'browser': 'chrome',
-                    }, fatal=False)
  
-                if fallback_info and fallback_info.get('video'):
-                    videos.append(fallback_info['video'])
+            video = dinfo.get('video')
+            if video:
+                videos.append(video)
+                if duration is None:
+                    duration = video.get('duration')
+                if is_live is None:
+                    is_live = video.get('is_live')
+                if spritesheets is None:
+                    spritesheets = video.get('spritesheets')
+
+            meta = dinfo.get('meta')
+            if meta:
+                if title is None:
+                    title = meta.get('title')
+                # meta['pre_title'] holds the season and episode numbers of a series, formatted as "S<ID> E<ID>"
+                season_number, episode_number = self._search_regex(
+                    r'S(\d+)\s*E(\d+)', meta.get('pre_title'), 'episode info', group=(1, 2), default=(None, None))
+                if subtitle is None:
+                    subtitle = meta.get('additional_title')
+                if image is None:
+                    image = meta.get('image_url')
+                if timestamp is None:
+                    timestamp = parse_iso8601(meta.get('broadcasted_at'))
  
          formats = []
          subtitles = {}
          for video in videos:
-            video_url = video.get('url')
-            if not video_url:
-                continue
-            if is_live is None:
-                is_live = (try_get(
-                    video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True
-                    or video.get('is_live') is True
-                    or '/live.francetv.fr/' in video_url)
              format_id = video.get('format')
+
+            video_url = None
+            if video.get('workflow') == 'token-akamai':
+                token_url = video.get('token')
+                if token_url:
+                    token_json = self._download_json(
+                        token_url, video_id,
+                        'Downloading signed %s manifest URL' % format_id)
+                    if token_json:
+                        video_url = token_json.get('url')
+            if not video_url:
+                video_url = video.get('url')
+
              ext = determine_ext(video_url)
              if ext == 'f4m':
-                if georestricted:
-                    # See https://github.com/ytdl-org/youtube-dl/issues/3963
-                    # m3u8 urls work fine
-                    continue
                  formats.extend(self._extract_f4m_formats(
-                    sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
-                    video_id, f4m_id=format_id, fatal=False))
+                    video_url, video_id, f4m_id=format_id, fatal=False))
              elif ext == 'm3u8':
-                m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
-                    sign(video_url, format_id), video_id, 'mp4',
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    video_url, video_id, 'mp4',
                      entry_protocol='m3u8_native', m3u8_id=format_id,
                      fatal=False)
-                formats.extend(m3u8_fmts)
-                subtitles = self._merge_subtitles(subtitles, m3u8_subs)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
              elif ext == 'mpd':
-                formats.extend(self._extract_mpd_formats(
-                    sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
+                fmts, subs = self._extract_mpd_formats_and_subtitles(
+                    video_url, video_id, mpd_id=format_id, fatal=False)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
              elif video_url.startswith('rtmp'):
                  formats.append({
                      'url': video_url,
@@ -193,31 +172,44 @@ def sign(manifest_url, manifest_id):
                          'format_id': format_id,
                      })
  
-        self._sort_formats(formats)
-
-        title = info['titre']
-        subtitle = info.get('sous_titre')
-        if subtitle:
-            title += ' - %s' % subtitle
-        title = title.strip()
-
-        subtitles.setdefault('fr', []).extend(
-            [{
-                'url': subformat['url'],
-                'ext': subformat.get('format'),
-            } for subformat in info.get('subtitles', []) if subformat.get('url')]
-        )
+            # XXX: what is video['captions']?
+
+        for f in formats:
+            if f.get('acodec') != 'none' and f.get('language') in ('qtz', 'qad'):
+                f['language_preference'] = -10
+                f['format_note'] = 'audio description%s' % format_field(f, 'format_note', ', %s')
+
+        if spritesheets:
+            formats.append({
+                'format_id': 'spritesheets',
+                'format_note': 'storyboard',
+                'acodec': 'none',
+                'vcodec': 'none',
+                'ext': 'mhtml',
+                'protocol': 'mhtml',
+                'url': 'about:invalid',
+                'fragments': [{
+                    'url': sheet,
+                    # XXX: not entirely accurate; each spritesheet seems to be
+                    # a 10×10 grid of thumbnails, each covering approximately
+                    # 2 seconds of the video; the last spritesheet may be shorter
+                    'duration': 200,
+                } for sheet in spritesheets]
+            })
  
          return {
              'id': video_id,
-            'title': self._live_title(title) if is_live else title,
-            'description': clean_html(info.get('synopsis')),
-            'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')),
-            'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')),
-            'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),
+            'title': join_nonempty(title, subtitle, delim=' - ').strip(),
+            'thumbnail': image,
+            'duration': duration,
+            'timestamp': timestamp,
              'is_live': is_live,
              'formats': formats,
              'subtitles': subtitles,
+            'episode': subtitle if episode_number else None,
+            'series': title if episode_number else None,
+            'episode_number': int_or_none(episode_number),
+            'season_number': int_or_none(season_number),
          }
  
      def _real_extract(self, url):
@@ -244,14 +236,31 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
              'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
              'ext': 'mp4',
              'title': '13h15, le dimanche... - Les mystères de Jésus',
-            'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
              'timestamp': 1502623500,
+            'duration': 2580,
+            'thumbnail': r're:^https?://.*\.jpg$',
              'upload_date': '20170813',
          },
          'params': {
              'skip_download': True,
          },
          'add_ie': [FranceTVIE.ie_key()],
+    }, {
+        'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html',
+        'info_dict': {
+            'id': 'a9050959-eedd-4b4a-9b0d-de6eeaa73e44',
+            'ext': 'mp4',
+            'title': 'Foot2Rue - Duel au vieux port',
+            'episode': 'Duel au vieux port',
+            'series': 'Foot2Rue',
+            'episode_number': 1,
+            'season_number': 1,
+            'timestamp': 1642761360,
+            'upload_date': '20220121',
+            'season': 'Season 1',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1441,
+        },
      }, {
          # france3
          'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
@@ -308,35 +317,6 @@ def _real_extract(self, url):
          return self._make_url_result(video_id, catalogue)
  
  
-class FranceTVEmbedIE(FranceTVBaseInfoExtractor):
-    _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)'
-
-    _TESTS = [{
-        'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961',
-        'info_dict': {
-            'id': 'NI_983319',
-            'ext': 'mp4',
-            'title': 'Le Pen Reims',
-            'upload_date': '20170505',
-            'timestamp': 1493981780,
-            'duration': 16,
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'add_ie': [FranceTVIE.ie_key()],
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        video = self._download_json(
-            'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id,
-            video_id)
-
-        return self._make_url_result(video['video_id'], video.get('catalog'))
-
-
  class FranceTVInfoIE(FranceTVBaseInfoExtractor):
      IE_NAME = 'francetvinfo.fr'
      _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)'
@@ -412,7 +392,7 @@ def _real_extract(self, url):
  
          webpage = self._download_webpage(url, display_id)
  
-        dailymotion_urls = DailymotionIE._extract_urls(webpage)
+        dailymotion_urls = tuple(DailymotionIE._extract_embed_urls(url, webpage))
          if dailymotion_urls:
              return self.playlist_result([
                  self.url_result(dailymotion_url, DailymotionIE.ie_key())
@@ -426,139 +406,3 @@ def _real_extract(self, url):
              webpage, 'video id')
  
          return self._make_url_result(video_id)
-
-
-class FranceTVInfoSportIE(FranceTVBaseInfoExtractor):
-    IE_NAME = 'sport.francetvinfo.fr'
-    _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)'
-    _TESTS = [{
-        'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018',
-        'info_dict': {
-            'id': '6e49080e-3f45-11e8-b459-000d3a2439ea',
-            'ext': 'mp4',
-            'title': 'Retour sur les meilleurs moments de Pyeongchang 2018',
-            'timestamp': 1523639962,
-            'upload_date': '20180413',
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'add_ie': [FranceTVIE.ie_key()],
-    }]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id')
-        return self._make_url_result(video_id, 'Sport-web')
-
-
-class GenerationWhatIE(InfoExtractor):
-    IE_NAME = 'france2.fr:generation-what'
-    _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)'
-
-    _TESTS = [{
-        'url': 'http://generation-what.francetv.fr/portrait/video/present-arms',
-        'info_dict': {
-            'id': 'wtvKYUG45iw',
-            'ext': 'mp4',
-            'title': 'Generation What - Garde à vous - FRA',
-            'uploader': 'Generation What',
-            'uploader_id': 'UCHH9p1eetWCgt4kXBYCb3_w',
-            'upload_date': '20160411',
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'add_ie': ['Youtube'],
-    }, {
-        'url': 'http://generation-what.francetv.fr/europe/video/present-arms',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, display_id)
-
-        youtube_id = self._search_regex(
-            r"window\.videoURL\s*=\s*'([0-9A-Za-z_-]{11})';",
-            webpage, 'youtube id')
-
-        return self.url_result(youtube_id, ie='Youtube', video_id=youtube_id)
-
-
-class CultureboxIE(FranceTVBaseInfoExtractor):
-    _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)'
-
-    _TESTS = [{
-        'url': 'https://culturebox.francetvinfo.fr/opera-classique/musique-classique/c-est-baroque/concerts/cantates-bwv-4-106-et-131-de-bach-par-raphael-pichon-57-268689',
-        'info_dict': {
-            'id': 'EV_134885',
-            'ext': 'mp4',
-            'title': 'Cantates BWV 4, 106 et 131 de Bach par Raphaël Pichon 5/7',
-            'description': 'md5:19c44af004b88219f4daa50fa9a351d4',
-            'upload_date': '20180206',
-            'timestamp': 1517945220,
-            'duration': 5981,
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'add_ie': [FranceTVIE.ie_key()],
-    }]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, display_id)
-
-        if ">Ce live n'est plus disponible en replay<" in webpage:
-            raise ExtractorError(
-                'Video %s is not available' % display_id, expected=True)
-
-        video_id, catalogue = self._search_regex(
-            r'["\'>]https?://videos\.francetv\.fr/video/([^@]+@.+?)["\'<]',
-            webpage, 'video id').split('@')
-
-        return self._make_url_result(video_id, catalogue)
-
-
-class FranceTVJeunesseIE(FranceTVBaseInfoExtractor):
-    _VALID_URL = r'(?P<url>https?://(?:www\.)?(?:zouzous|ludo)\.fr/heros/(?P<id>[^/?#&]+))'
-
-    _TESTS = [{
-        'url': 'https://www.zouzous.fr/heros/simon',
-        'info_dict': {
-            'id': 'simon',
-        },
-        'playlist_count': 9,
-    }, {
-        'url': 'https://www.ludo.fr/heros/ninjago',
-        'info_dict': {
-            'id': 'ninjago',
-        },
-        'playlist_count': 10,
-    }, {
-        'url': 'https://www.zouzous.fr/heros/simon?abc',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        playlist_id = mobj.group('id')
-
-        playlist = self._download_json(
-            '%s/%s' % (mobj.group('url'), 'playlist'), playlist_id)
-
-        if not playlist.get('count'):
-            raise ExtractorError(
-                '%s is not available' % playlist_id, expected=True)
-
-        entries = []
-        for item in playlist['items']:
-            identity = item.get('identity')
-            if identity and isinstance(identity, compat_str):
-                entries.append(self._make_url_result(identity))
-
-        return self.playlist_result(entries, playlist_id)