-# coding: utf-8
-from __future__ import unicode_literals
-
import json
import re
try_get,
unified_strdate,
unified_timestamp,
+ update_url,
update_url_query,
url_or_none,
xpath_text,
'This video is not available due to geoblocking',
countries=self._GEO_COUNTRIES, metadata_available=True)
- self._sort_formats(formats)
-
subtitles = {}
subtitle_url = media_info.get('_subtitleUrl')
if subtitle_url:
subtitles['de'] = [{
'ext': 'ttml',
'url': subtitle_url,
+ }, {
+ 'ext': 'vtt',
+ 'url': subtitle_url.replace('/ebutt/', '/webvtt/') + '.vtt',
}]
return {
def _real_extract(self, url):
# determine video id from url
- m = re.match(self._VALID_URL, url)
+ m = self._match_valid_url(url)
document_id = None
'format_id': fid,
'url': furl,
})
- self._sort_formats(formats)
info = {
'formats': formats,
}
info.update({
'id': video_id,
- 'title': self._live_title(title) if info.get('is_live') else title,
+ 'title': title,
'description': description,
'thumbnail': thumbnail,
})
class ARDIE(InfoExtractor):
- _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-?(?:video-?)?(?P<id>[0-9]+))\.html'
+ _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
_TESTS = [{
- # available till 7.01.2022
- 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
- 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
+ # available till 7.12.2023
+ 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
+ 'md5': 'a438f671e87a7eba04000336a119ccc4',
'info_dict': {
- 'display_id': 'maischberger-die-woche',
- 'id': '100',
+ 'id': 'maischberger-video-424',
+ 'display_id': 'maischberger-video-424',
'ext': 'mp4',
- 'duration': 3687.0,
- 'title': 'maischberger. die woche vom 7. Januar 2021',
- 'upload_date': '20210107',
+ 'duration': 4452.0,
+ 'title': 'maischberger am 07.12.2022',
+ 'upload_date': '20221207',
'thumbnail': r're:^https?://.*\.jpg$',
},
}, {
- 'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html',
+ 'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html',
'only_matching': True,
}, {
'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/videos/diversity-tag-sanam-afrashteh100.html',
}, {
'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
'only_matching': True,
+ }, {
+ 'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
+ mobj = self._match_valid_url(url)
+ display_id = mobj.group('id')
player_url = mobj.group('mainurl') + '~playerXml.xml'
doc = self._download_xml(player_url, display_id)
continue
f['url'] = format_url
formats.append(f)
- self._sort_formats(formats)
+
+ _SUB_FORMATS = (
+ ('./dataTimedText', 'ttml'),
+ ('./dataTimedTextNoOffset', 'ttml'),
+ ('./dataTimedTextVtt', 'vtt'),
+ )
+
+ subtitles = {}
+ for subsel, subext in _SUB_FORMATS:
+ for node in video_node.findall(subsel):
+ subtitles.setdefault('de', []).append({
+ 'url': node.attrib['url'],
+ 'ext': subext,
+ })
return {
- 'id': mobj.group('id'),
+ 'id': xpath_text(video_node, './videoId', default=display_id),
'formats': formats,
+ 'subtitles': subtitles,
'display_id': display_id,
'title': video_node.find('./title').text,
'duration': parse_duration(video_node.find('./duration').text),
class ARDBetaMediathekIE(ARDMediathekBaseIE):
- _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
+ _VALID_URL = r'''(?x)https://
+ (?:(?:beta|www)\.)?ardmediathek\.de/
+ (?:(?P<client>[^/]+)/)?
+ (?:player|live|video|(?P<playlist>sendung|sammlung))/
+ (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
+ (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
+ (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
+
_TESTS = [{
+ 'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI',
+ 'md5': '3fd5fead7a370a819341129c8d713136',
+ 'info_dict': {
+ 'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen',
+ 'id': '12172961',
+ 'title': 'Wolfsland - Die traurigen Schwestern',
+ 'description': r're:^Als der Polizeiobermeister Raaben',
+ 'duration': 5241,
+ 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957',
+ 'timestamp': 1670710500,
+ 'upload_date': '20221210',
+ 'ext': 'mp4',
+ 'age_limit': 12,
+ 'episode': 'Wolfsland - Die traurigen Schwestern',
+ 'series': 'Filme im MDR'
+ },
+ }, {
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
'info_dict': {
'upload_date': '20200805',
'ext': 'mp4',
},
+ 'skip': 'Error',
+ }, {
+ 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
+ 'md5': '1e73ded21cb79bac065117e80c81dc88',
+ 'info_dict': {
+ 'id': '10049223',
+ 'ext': 'mp4',
+ 'title': 'tagesschau, 20:00 Uhr',
+ 'timestamp': 1636398000,
+ 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
+ 'upload_date': '20211108',
+ 'display_id': 'tagesschau-oder-tagesschau-20-00-uhr/das-erste',
+ 'duration': 915,
+ 'episode': 'tagesschau, 20:00 Uhr',
+ 'series': 'tagesschau',
+ 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49',
+ },
}, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
'only_matching': True,
# playlist of type 'sammlung'
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
+ 'only_matching': True,
}]
def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
break
pageNumber = pageNumber + 1
- return self.playlist_result(entries, playlist_title=display_id)
+ return self.playlist_result(entries, playlist_id, playlist_title=display_id)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('video_id')
- display_id = mobj.group('display_id')
- if display_id:
- display_id = display_id.rstrip('/')
- if not display_id:
- display_id = video_id
-
- if mobj.group('mode') in ('sendung', 'sammlung'):
- # this is a playlist-URL
- return self._ARD_extract_playlist(
- url, video_id, display_id,
- mobj.group('client'),
- mobj.group('mode'))
+ video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
+ 'id', 'display_id', 'playlist', 'client', 'season')
+ display_id, client = display_id or video_id, client or 'ard'
+
+ if playlist_type:
+ # TODO: Extract only specified season
+ return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
player_page = self._download_json(
'https://api.ardmediathek.de/public-gateway',
show {
title
}
+ image {
+ src
+ }
synopsis
title
tracking {
}
}
}
-}''' % (mobj.group('client'), video_id),
+}''' % (client, video_id),
}).encode(), headers={
'Content-Type': 'application/json'
})['data']['playerPage']
'description': description,
'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
'series': try_get(player_page, lambda x: x['show']['title']),
+ 'thumbnail': (media_collection.get('_previewImage')
+ or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
+ or self.get_thumbnail_from_html(display_id, url)),
})
info.update(self._ARD_extract_episode_info(info['title']))
return info
+
+ def get_thumbnail_from_html(self, display_id, url):
+ webpage = self._download_webpage(url, display_id, fatal=False) or ''
+ return (
+ self._og_search_thumbnail(webpage, default=None)
+ or self._html_search_meta('thumbnailUrl', webpage, default=None))