jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/arte.py
[generic] Extract subtitles from video.js (#3156)
[yt-dlp.git] / yt_dlp / extractor / arte.py
index 296b169d2a844dd87b0f274464d0dd14cb6ebc4d..c2f2c1bd3c9fff6d2088909a0dd218f90ffb2d04 100644 (file)
@@ -12,6 +12,7 @@
     int_or_none,
     parse_qs,
     qualities,
+    strip_or_none,
     try_get,
     unified_strdate,
     url_or_none,
@@ -137,6 +138,7 @@ def _real_extract(self, url):
                     break
             else:
                 lang_pref = -1
+            format_note = '%s, %s' % (f.get('versionCode'), f.get('versionLibelle'))
 
             media_type = f.get('mediaType')
             if media_type == 'hls':
@@ -144,14 +146,17 @@ def _real_extract(self, url):
                     format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                     m3u8_id=format_id, fatal=False)
                 for m3u8_format in m3u8_formats:
-                    m3u8_format['language_preference'] = lang_pref
+                    m3u8_format.update({
+                        'language_preference': lang_pref,
+                        'format_note': format_note,
+                    })
                 formats.extend(m3u8_formats)
                 continue
 
             format = {
                 'format_id': format_id,
                 'language_preference': lang_pref,
-                'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
+                'format_note': format_note,
                 'width': int_or_none(f.get('width')),
                 'height': int_or_none(f.get('height')),
                 'tbr': int_or_none(f.get('bitrate')),
@@ -253,3 +258,44 @@ def _real_extract(self, url):
         title = collection.get('title')
         description = collection.get('shortDescription') or collection.get('teaserText')
         return self.playlist_result(entries, playlist_id, title, description)
+
+
+class ArteTVCategoryIE(ArteTVBaseIE):
+    """Playlist of the video and playlist links found on an arte.tv category page."""
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
+    _TESTS = [{
+        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
+        'info_dict': {
+            'id': 'politics-and-society',
+            'title': 'Politics and society',
+            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
+        },
+        'playlist_mincount': 13,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        # Defer to the video/playlist extractors for any URL they accept
+        return (not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
+                and super(ArteTVCategoryIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        """Download the category page and return its linked Arte URLs as a playlist."""
+        lang, playlist_id = self._match_valid_url(url).groups()
+        webpage = self._download_webpage(url, playlist_id)
+
+        items = []
+        for video in re.finditer(
+                r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang, webpage):
+            video = video.group('url')
+            # Skip self-links; keep only URLs the video/playlist extractors accept
+            if video != url and any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
+                items.append(video)
+
+        # Either lookup may yield None, so guard before splitting off the '| ...' suffix
+        title = (self._og_search_title(webpage, default=None)
+                 or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None))
+        title = strip_or_none((title or '').rsplit('|', 1)[0]) or self._generic_title(url)
+
+        return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
+                                          description=self._og_search_description(webpage, default=None))