]> jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/ard.py
[ie/ARDBetaMediathek] Fix series extraction (#8687)
[yt-dlp.git] / yt_dlp / extractor / ard.py
index 12a7cfb54a1fc891ce0e2d8565ddca8d511fbb31..8ac926c917fbaac4b5c7aa00a2386d72304193f0 100644 (file)
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import json
 import re
 
@@ -16,6 +13,7 @@
     try_get,
     unified_strdate,
     unified_timestamp,
+    update_url,
     update_url_query,
     url_or_none,
     xpath_text,
@@ -36,14 +34,12 @@ def _parse_media_info(self, media_info, video_id, fsk):
 
         if not formats:
             if fsk:
-                raise ExtractorError(
+                self.raise_no_formats(
                     'This video is only available after 20:00', expected=True)
             elif media_info.get('_geoblocked'):
                 self.raise_geo_restricted(
                     'This video is not available due to geoblocking',
-                    countries=self._GEO_COUNTRIES)
-
-        self._sort_formats(formats)
+                    countries=self._GEO_COUNTRIES, metadata_available=True)
 
         subtitles = {}
         subtitle_url = media_info.get('_subtitleUrl')
@@ -51,6 +47,9 @@ def _parse_media_info(self, media_info, video_id, fsk):
             subtitles['de'] = [{
                 'ext': 'ttml',
                 'url': subtitle_url,
+            }, {
+                'ext': 'vtt',
+                'url': subtitle_url.replace('/ebutt/', '/webvtt/') + '.vtt',
             }]
 
         return {
@@ -199,7 +198,7 @@ def suitable(cls, url):
 
     def _real_extract(self, url):
         # determine video id from url
-        m = re.match(self._VALID_URL, url)
+        m = self._match_valid_url(url)
 
         document_id = None
 
@@ -265,21 +264,21 @@ def _real_extract(self, url):
                     'format_id': fid,
                     'url': furl,
                 })
-            self._sort_formats(formats)
             info = {
                 'formats': formats,
             }
         else:  # request JSON file
             if not document_id:
                 video_id = self._search_regex(
-                    r'/play/(?:config|media)/(\d+)', webpage, 'media id')
+                    (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'),
+                    webpage, 'media id', default=None)
             info = self._extract_media_info(
                 'http://www.ardmediathek.de/play/media/%s' % video_id,
                 webpage, video_id)
 
         info.update({
             'id': video_id,
-            'title': self._live_title(title) if info.get('is_live') else title,
+            'title': title,
             'description': description,
             'thumbnail': thumbnail,
         })
@@ -289,31 +288,43 @@ def _real_extract(self, url):
 
 
 class ARDIE(InfoExtractor):
-    _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?:video-?)?(?P<id>[0-9]+))\.html'
+    _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
     _TESTS = [{
-        # available till 7.01.2022
-        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
-        'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
+        # available till 7.12.2023
+        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
+        'md5': '94812e6438488fb923c361a44469614b',
         'info_dict': {
-            'display_id': 'maischberger-die-woche',
-            'id': '100',
+            'id': 'maischberger-video-424',
+            'display_id': 'maischberger-video-424',
             'ext': 'mp4',
-            'duration': 3687.0,
-            'title': 'maischberger. die woche vom 7. Januar 2021',
-            'upload_date': '20210107',
+            'duration': 4452.0,
+            'title': 'maischberger am 07.12.2022',
+            'upload_date': '20221207',
             'thumbnail': r're:^https?://.*\.jpg$',
         },
     }, {
-        'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html',
+        'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/videos/diversity-tag-sanam-afrashteh100.html',
         'only_matching': True,
     }, {
         'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
         'only_matching': True,
+    }, {
+        'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('display_id')
+        mobj = self._match_valid_url(url)
+        display_id = mobj.group('id')
 
         player_url = mobj.group('mainurl') + '~playerXml.xml'
         doc = self._download_xml(player_url, display_id)
@@ -361,11 +372,25 @@ def _real_extract(self, url):
                     continue
                 f['url'] = format_url
             formats.append(f)
-        self._sort_formats(formats)
+
+        _SUB_FORMATS = (
+            ('./dataTimedText', 'ttml'),
+            ('./dataTimedTextNoOffset', 'ttml'),
+            ('./dataTimedTextVtt', 'vtt'),
+        )
+
+        subtitles = {}
+        for subsel, subext in _SUB_FORMATS:
+            for node in video_node.findall(subsel):
+                subtitles.setdefault('de', []).append({
+                    'url': node.attrib['url'],
+                    'ext': subext,
+                })
 
         return {
-            'id': mobj.group('id'),
+            'id': xpath_text(video_node, './videoId', default=display_id),
             'formats': formats,
+            'subtitles': subtitles,
             'display_id': display_id,
             'title': video_node.find('./title').text,
             'duration': parse_duration(video_node.find('./duration').text),
@@ -375,8 +400,31 @@ def _real_extract(self, url):
 
 
 class ARDBetaMediathekIE(ARDMediathekBaseIE):
-    _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
+    _VALID_URL = r'''(?x)https://
+        (?:(?:beta|www)\.)?ardmediathek\.de/
+        (?:(?P<client>[^/]+)/)?
+        (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
+        (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
+        (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
+        (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
+
     _TESTS = [{
+        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
+        'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
+        'info_dict': {
+            'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
+            'id': '12939099',
+            'title': 'Liebe auf vier Pfoten',
+            'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
+            'duration': 5222,
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
+            'timestamp': 1701343800,
+            'upload_date': '20231130',
+            'ext': 'mp4',
+            'episode': 'Liebe auf vier Pfoten',
+            'series': 'Filme im MDR'
+        },
+    }, {
         'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
         'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
         'info_dict': {
@@ -390,6 +438,23 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
             'upload_date': '20200805',
             'ext': 'mp4',
         },
+        'skip': 'Error',
+    }, {
+        'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
+        'md5': '1e73ded21cb79bac065117e80c81dc88',
+        'info_dict': {
+            'id': '10049223',
+            'ext': 'mp4',
+            'title': 'tagesschau, 20:00 Uhr',
+            'timestamp': 1636398000,
+            'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
+            'upload_date': '20211108',
+            'display_id': 'tagesschau-oder-tagesschau-20-00-uhr/das-erste',
+            'duration': 915,
+            'episode': 'tagesschau, 20:00 Uhr',
+            'series': 'tagesschau',
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
+        },
     }, {
         'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
         'only_matching': True,
@@ -409,16 +474,27 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
         # playlist of type 'sendung'
         'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
         'only_matching': True,
+    }, {
+        # playlist of type 'serie'
+        'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
+        'only_matching': True,
     }, {
         # playlist of type 'sammlung'
         'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
         'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
+        'only_matching': True,
     }]
 
-    def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
+    def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
         """ Query the ARD server for playlist information
         and returns the data in "raw" format """
-        if mode == 'sendung':
+        assert mode in ('sendung', 'serie', 'sammlung')
+        if mode in ('sendung', 'serie'):
             graphQL = json.dumps({
                 'query': '''{
                     showPage(
@@ -435,7 +511,7 @@ def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, page
                             links { target { id href title } }
                             type
                         }
-                    }}''' % (client, playlist_id, pageNumber),
+                    }}''' % (client, playlist_id, page_number),
             }).encode()
         else:  # mode == 'sammlung'
             graphQL = json.dumps({
@@ -456,7 +532,7 @@ def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, page
                                 type
                             }
                         }
-                    }}''' % (client, playlist_id, pageNumber),
+                    }}''' % (client, playlist_id, page_number),
             }).encode()
         # Ressources for ARD graphQL debugging:
         # https://api-test.ardmediathek.de/public-gateway
@@ -466,7 +542,7 @@ def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, page
             data=graphQL,
             headers={'Content-Type': 'application/json'})['data']
         # align the structure of the returned data:
-        if mode == 'sendung':
+        if mode in ('sendung', 'serie'):
             show_page = show_page['showPage']
         else:  # mode == 'sammlung'
             show_page = show_page['morePage']['widget']
@@ -474,12 +550,12 @@ def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, page
 
     def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
         """ Collects all playlist entries and returns them as info dict.
-        Supports playlists of mode 'sendung' and 'sammlung', and also nested
-        playlists. """
+        Supports playlists of mode 'sendung', 'serie', and 'sammlung',
+        as well as nested playlists. """
         entries = []
         pageNumber = 0
         while True:  # iterate by pageNumber
-            show_page = self._ARD_load_playlist_snipped(
+            show_page = self._ARD_load_playlist_snippet(
                 playlist_id, display_id, client, mode, pageNumber)
             for teaser in show_page['teasers']:  # process playlist items
                 if '/compilation/' in teaser['links']['target']['href']:
@@ -509,23 +585,16 @@ def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
                 break
             pageNumber = pageNumber + 1
 
-        return self.playlist_result(entries, playlist_title=display_id)
+        return self.playlist_result(entries, playlist_id, playlist_title=display_id)
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('video_id')
-        display_id = mobj.group('display_id')
-        if display_id:
-            display_id = display_id.rstrip('/')
-        if not display_id:
-            display_id = video_id
-
-        if mobj.group('mode') in ('sendung', 'sammlung'):
-            # this is a playlist-URL
-            return self._ARD_extract_playlist(
-                url, video_id, display_id,
-                mobj.group('client'),
-                mobj.group('mode'))
+        video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
+            'id', 'display_id', 'playlist', 'client', 'season')
+        display_id, client = display_id or video_id, client or 'ard'
+
+        if playlist_type:
+            # TODO: Extract only specified season
+            return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
 
         player_page = self._download_json(
             'https://api.ardmediathek.de/public-gateway',
@@ -553,6 +622,9 @@ def _real_extract(self, url):
     show {
       title
     }
+    image {
+      src
+    }
     synopsis
     title
     tracking {
@@ -561,7 +633,7 @@ def _real_extract(self, url):
       }
     }
   }
-}''' % (mobj.group('client'), video_id),
+}''' % (client, video_id),
             }).encode(), headers={
                 'Content-Type': 'application/json'
             })['data']['playerPage']
@@ -591,6 +663,15 @@ def _real_extract(self, url):
             'description': description,
             'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
             'series': try_get(player_page, lambda x: x['show']['title']),
+            'thumbnail': (media_collection.get('_previewImage')
+                          or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
+                          or self.get_thumbnail_from_html(display_id, url)),
         })
         info.update(self._ARD_extract_episode_info(info['title']))
         return info
+
+    def get_thumbnail_from_html(self, display_id, url):
+        webpage = self._download_webpage(url, display_id, fatal=False) or ''
+        return (
+            self._og_search_thumbnail(webpage, default=None)
+            or self._html_search_meta('thumbnailUrl', webpage, default=None))