jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/francetv.py
[utils] Add `parse_qs`
[yt-dlp.git] / yt_dlp / extractor / francetv.py
index ab0df1bedd646d14f35dc0a11c8052e652b8a904..41910cefb1f6b03a20bf86fc3c4259e05dd206d3 100644 (file)
@@ -2,12 +2,10 @@
 
 from __future__ import unicode_literals
 
-import re
 
 from .common import InfoExtractor
 from ..compat import (
     compat_str,
-    compat_urlparse,
 )
 from ..utils import (
     clean_html,
@@ -15,6 +13,7 @@
     ExtractorError,
     int_or_none,
     parse_duration,
+    parse_qs,
     try_get,
     url_or_none,
     urljoin,
@@ -151,6 +150,7 @@ def sign(manifest_url, manifest_id):
                     videos.append(fallback_info['video'])
 
         formats = []
+        subtitles = {}
         for video in videos:
             video_url = video.get('url')
             if not video_url:
@@ -171,10 +171,12 @@ def sign(manifest_url, manifest_id):
                     sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
                     video_id, f4m_id=format_id, fatal=False))
             elif ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
+                m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                     sign(video_url, format_id), video_id, 'mp4',
                     entry_protocol='m3u8_native', m3u8_id=format_id,
-                    fatal=False))
+                    fatal=False)
+                formats.extend(m3u8_fmts)
+                subtitles = self._merge_subtitles(subtitles, m3u8_subs)
             elif ext == 'mpd':
                 formats.extend(self._extract_mpd_formats(
                     sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
@@ -199,13 +201,12 @@ def sign(manifest_url, manifest_id):
             title += ' - %s' % subtitle
         title = title.strip()
 
-        subtitles = {}
-        subtitles_list = [{
-            'url': subformat['url'],
-            'ext': subformat.get('format'),
-        } for subformat in info.get('subtitles', []) if subformat.get('url')]
-        if subtitles_list:
-            subtitles['fr'] = subtitles_list
+        subtitles.setdefault('fr', []).extend(
+            [{
+                'url': subformat['url'],
+                'ext': subformat.get('format'),
+            } for subformat in info.get('subtitles', []) if subformat.get('url')]
+        )
 
         return {
             'id': video_id,
@@ -220,12 +221,12 @@ def sign(manifest_url, manifest_id):
         }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
         video_id = mobj.group('id')
         catalog = mobj.group('catalog')
 
         if not video_id:
-            qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+            qs = parse_qs(url)
             video_id = qs.get('idDiffusion', [None])[0]
             catalog = qs.get('catalogue', [None])[0]
             if not video_id:
@@ -357,6 +358,22 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
             'skip_download': True,
         },
         'add_ie': [FranceTVIE.ie_key()],
+    }, {
+        'note': 'Only an image exists in initial webpage instead of the video',
+        'url': 'https://www.francetvinfo.fr/sante/maladie/coronavirus/covid-19-en-inde-une-situation-catastrophique-a-new-dehli_4381095.html',
+        'info_dict': {
+            'id': '7d204c9e-a2d3-11eb-9e4c-000d3a23d482',
+            'ext': 'mp4',
+            'title': 'Covid-19 : une situation catastrophique à New Dehli',
+            'thumbnail': str,
+            'duration': 76,
+            'timestamp': 1619028518,
+            'upload_date': '20210421',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [FranceTVIE.ie_key()],
     }, {
         'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html',
         'only_matching': True,
@@ -384,6 +401,10 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
     }, {
         'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin',
         'only_matching': True,
+    }, {
+        # "<figure id=" pattern (#28792)
+        'url': 'https://www.francetvinfo.fr/culture/patrimoine/incendie-de-notre-dame-de-paris/notre-dame-de-paris-de-l-incendie-de-la-cathedrale-a-sa-reconstruction_4372291.html',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -401,7 +422,7 @@ def _real_extract(self, url):
             (r'player\.load[^;]+src:\s*["\']([^"\']+)',
              r'id-video=([^@]+@[^"]+)',
              r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"',
-             r'data-id="([^"]+)"'),
+             r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'),
             webpage, 'video id')
 
         return self._make_url_result(video_id)
@@ -524,7 +545,7 @@ class FranceTVJeunesseIE(FranceTVBaseInfoExtractor):
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        mobj = self._match_valid_url(url)
         playlist_id = mobj.group('id')
 
         playlist = self._download_json(