jfr.im git - yt-dlp.git/commitdiff
[VideocampusSachsen] Improve extractor (#3604)
author FestplattenSchnitzel <redacted>
Thu, 5 May 2022 17:31:54 +0000 (19:31 +0200)
committer GitHub <redacted>
Thu, 5 May 2022 17:31:54 +0000 (10:31 -0700)
Authored by: FestplattenSchnitzel

yt_dlp/extractor/extractors.py
yt_dlp/extractor/videocampus_sachsen.py

index 2c09a161ec0405cb6fbebff6aeb96f8b783cae9e..6f6862915a52099c715b1f20276026dae2d27d3f 100644 (file)
 from .vidbit import VidbitIE
 from .viddler import ViddlerIE
 from .videa import VideaIE
-from .videocampus_sachsen import (
-    VideocampusSachsenIE,
-    VideocampusSachsenEmbedIE,
-)
+from .videocampus_sachsen import VideocampusSachsenIE
 from .videodetective import VideoDetectiveIE
 from .videofyme import VideofyMeIE
 from .videomore import (
index fe9e061ae2e4842e78e5bfbf76f040a20113df5e..906412f08df4b64fdb1a884f70d13427e229a2a9 100644 (file)
@@ -1,11 +1,70 @@
+import re
+
 from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import ExtractorError
 
 
 class VideocampusSachsenIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://videocampus\.sachsen\.de/(?:
+    IE_NAME = 'Vimp'
+    _INSTANCES = (
+        'campus.demo.vimp.com',
+        'corporate.demo.vimp.com',
+        'dancehalldatabase.com',
+        'educhannel.hs-gesundheit.de',
+        'emedia.ls.haw-hamburg.de',
+        'globale-evolution.net',
+        'k210039.vimp.mivitec.net',
+        'media.cmslegal.com',
+        'media.hs-furtwangen.de',
+        'media.hwr-berlin.de',
+        'mediathek.dkfz.de',
+        'mediathek.htw-berlin.de',
+        'mediathek.polizei-bw.de',
+        'medien.hs-merseburg.de',
+        'mportal.europa-uni.de',
+        'pacific.demo.vimp.com',
+        'slctv.com',
+        'tube.isbonline.cn',
+        'univideo.uni-kassel.de',
+        'ursula2.genetics.emory.edu',
+        'ursulablicklevideoarchiv.com',
+        'v.agrarumweltpaedagogik.at',
+        'video.eplay-tv.de',
+        'video.fh-dortmund.de',
+        'video.hs-offenburg.de',
+        'video.hs-pforzheim.de',
+        'video.hspv.nrw.de',
+        'video.irtshdf.fr',
+        'video.pareygo.de',
+        'video.tu-freiberg.de',
+        'videocampus.sachsen.de',
+        'videoportal.uni-freiburg.de',
+        'videoportal.vm.uni-freiburg.de',
+        'videos.duoc.cl',
+        'videos.uni-paderborn.de',
+        'vimp-bemus.udk-berlin.de',
+        'vimp.aekwl.de',
+        'vimp.hs-mittweida.de',
+        'vimp.oth-regensburg.de',
+        'vimp.ph-heidelberg.de',
+        'vimp.sma-events.com',
+        'vimp.weka-fachmedien.de',
+        'webtv.univ-montp3.fr',
+        'www.b-tu.de/media',
+        'www.bigcitytv.de',
+        'www.cad-videos.de',
+        'www.fh-bielefeld.de/medienportal',
+        'www.orvovideo.com',
+        'www.rwe.tv',
+        'www.wenglor-media.com',
+        'www2.univ-sba.dz',
+    )
+    _VALID_URL = r'''(?x)https?://(?P<host>%s)/(?:
         m/(?P<tmp_id>[0-9a-f]+)|
-        (?:category/)?video/(?P<display_id>[\w-]+)/(?P<id>[0-9a-f]{32})
-    )'''
+        (?:category/)?video/(?P<display_id>[\w-]+)/(?P<id>[0-9a-f]{32})|
+        media/embed.*(?:\?|&)key=(?P<embed_id>[0-9a-f]{32}&?)
+    )''' % ('|'.join(map(re.escape, _INSTANCES)))
 
     _TESTS = [
         {
@@ -13,6 +72,7 @@ class VideocampusSachsenIE(InfoExtractor):
             'info_dict': {
                 'id': 'e6b9349905c1628631f175712250f2a1',
                 'title': 'Konstruktiver Entwicklungsprozess Vorlesung 7',
+                'description': 'Konstruktiver Entwicklungsprozess Vorlesung 7',
                 'ext': 'mp4',
             },
         },
@@ -21,6 +81,7 @@ class VideocampusSachsenIE(InfoExtractor):
             'info_dict': {
                 'id': 'fc99c527e4205b121cb7c74433469262',
                 'title': 'Was ist selbstgesteuertes Lernen?',
+                'description': 'md5:196aa3b0509a526db62f84679522a2f5',
                 'display_id': 'Was-ist-selbstgesteuertes-Lernen',
                 'ext': 'mp4',
             },
@@ -30,43 +91,32 @@ class VideocampusSachsenIE(InfoExtractor):
             'info_dict': {
                 'id': '09d4ed029002eb1bdda610f1103dd54c',
                 'title': 'Tutorial zur Nutzung von Adobe Connect aus Veranstalter-Sicht',
+                'description': 'md5:3d379ca3cc17b9da6784d7f58cca4d58',
                 'display_id': 'Tutorial-zur-Nutzung-von-Adobe-Connect-aus-Veranstalter-Sicht',
                 'ext': 'mp4',
             },
         },
-    ]
-
-    def _real_extract(self, url):
-        video_id, tmp_id, display_id = self._match_valid_url(url).group('id', 'tmp_id', 'display_id')
-        webpage = self._download_webpage(url, video_id or tmp_id, fatal=False) or ''
-
-        if not tmp_id:
-            video_id = self._html_search_regex(
-                r'src="https?://videocampus\.sachsen\.de/media/embed\?key=([0-9a-f]+)&',
-                webpage, 'video_id')
-
-        title = self._html_search_regex(
-            (r'<h1>(?P<content>[^<]+)</h1>', *self._meta_regex('title')),
-            webpage, 'title', group='content', fatal=False)
-
-        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-            f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8',
-            video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'display_id': display_id,
-            'formats': formats,
-            'subtitles': subtitles
-        }
-
-
-class VideocampusSachsenEmbedIE(InfoExtractor):
-    _VALID_URL = r'https?://videocampus.sachsen.de/media/embed\?key=(?P<id>[0-9a-f]+)'
-
-    _TESTS = [
+        {
+            'url': 'https://www2.univ-sba.dz/video/Presentation-de-la-Faculte-de-droit-et-des-sciences-politiques-Journee-portes-ouvertes-202122/0183356e41af7bfb83d7667b20d9b6a3',
+            'info_dict': {
+                'url': 'https://www2.univ-sba.dz/getMedium/0183356e41af7bfb83d7667b20d9b6a3.mp4',
+                'id': '0183356e41af7bfb83d7667b20d9b6a3',
+                'title': 'Présentation de la Faculté de droit et des sciences politiques - Journée portes ouvertes 2021/22',
+                'description': 'md5:508958bd93e0ca002ac731d94182a54f',
+                'display_id': 'Presentation-de-la-Faculte-de-droit-et-des-sciences-politiques-Journee-portes-ouvertes-202122',
+                'ext': 'mp4',
+            }
+        },
+        {
+            'url': 'https://vimp.weka-fachmedien.de/video/Preisverleihung-Produkte-des-Jahres-2022/c8816f1cc942c12b6cce57c835cffd7c',
+            'info_dict': {
+                'id': 'c8816f1cc942c12b6cce57c835cffd7c',
+                'title': 'Preisverleihung »Produkte des Jahres 2022«',
+                'description': 'md5:60c347568ca89aa25b772c4ea564ebd3',
+                'display_id': 'Preisverleihung-Produkte-des-Jahres-2022',
+                'ext': 'mp4',
+            },
+        },
         {
             'url': 'https://videocampus.sachsen.de/media/embed?key=fc99c527e4205b121cb7c74433469262',
             'info_dict': {
@@ -78,18 +128,41 @@ class VideocampusSachsenEmbedIE(InfoExtractor):
     ]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        host, video_id, tmp_id, display_id, embed_id = self._match_valid_url(url).group(
+            'host', 'id', 'tmp_id', 'display_id', 'embed_id')
+        webpage = self._download_webpage(url, video_id or tmp_id, fatal=False) or ''
+
+        if not video_id:
+            video_id = embed_id or self._html_search_regex(
+                rf'src="https?://{host}/media/embed.*(?:\?|&)key=([0-9a-f]+)&?',
+                webpage, 'video_id')
 
-        webpage = self._download_webpage(url, video_id)
-        title = self._html_search_regex(r'<img[^>]*title="([^"<]+)"', webpage, 'title', fatal=False)
-        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-            f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8',
-            video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+        if not (display_id or tmp_id):
+            # Title, description from embedded page's meta wouldn't be correct
+            title = self._html_search_regex(r'<img[^>]* title="([^"<]+)"', webpage, 'title', fatal=False)
+            description = None
+        else:
+            title = self._html_search_meta(('og:title', 'twitter:title', 'title'), webpage, fatal=False)
+            description = self._html_search_meta(
+                ('og:description', 'twitter:description', 'description'), webpage, default=None)
+
+        formats, subtitles = [], {}
+        try:
+            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+                f'https://{host}/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8',
+                video_id, 'mp4', m3u8_id='hls', fatal=True)
+        except ExtractorError as e:
+            if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (404, 500):
+                raise
+
+        formats.append({'url': f'https://{host}/getMedium/{video_id}.mp4'})
         self._sort_formats(formats)
 
         return {
             'id': video_id,
             'title': title,
+            'description': description,
+            'display_id': display_id,
             'formats': formats,
-            'subtitles': subtitles,
+            'subtitles': subtitles
         }