youtube_dl/extractor/ard.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from .generic import GenericIE
   8 from ..utils import (
   9     determine_ext,
  10     ExtractorError,
  11     qualities,
  12     int_or_none,
  13     parse_duration,
  14     unified_strdate,
  15     xpath_text,
  16     parse_xml,
  17 )
  18
  19
  20 class ARDMediathekIE(InfoExtractor):
  21     IE_NAME = 'ARD:mediathek'
  22     _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
  23
  24     _TESTS = [{
  25         'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
  26         'only_matching': True,
  27     }, {
  28         'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
  29         'info_dict': {
  30             'id': '22490580',
  31             'ext': 'mp4',
  32             'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
  33             'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
  34         },
  35         'skip': 'Blocked outside of Germany',
  36     }]
  37
  38     def _extract_media_info(self, media_info_url, webpage, video_id):
  39         media_info = self._download_json(
  40             media_info_url, video_id, 'Downloading media JSON')
  41
  42         formats = self._extract_formats(media_info, video_id)
  43
  44         if not formats:
  45             if '"fsk"' in webpage:
  46                 raise ExtractorError(
  47                     'This video is only available after 20:00', expected=True)
  48             elif media_info.get('_geoblocked'):
  49                 raise ExtractorError('This video is not available due to geo restriction', expected=True)
  50
  51         self._sort_formats(formats)
  52
  53         duration = int_or_none(media_info.get('_duration'))
  54         thumbnail = media_info.get('_previewImage')
  55
  56         subtitles = {}
  57         subtitle_url = media_info.get('_subtitleUrl')
  58         if subtitle_url:
  59             subtitles['de'] = [{
  60                 'ext': 'srt',
  61                 'url': subtitle_url,
  62             }]
  63
  64         return {
  65             'id': video_id,
  66             'duration': duration,
  67             'thumbnail': thumbnail,
  68             'formats': formats,
  69             'subtitles': subtitles,
  70         }
  71
  72     def _extract_formats(self, media_info, video_id):
  73         type_ = media_info.get('_type')
  74         media_array = media_info.get('_mediaArray', [])
  75         formats = []
  76         for num, media in enumerate(media_array):
  77             for stream in media.get('_mediaStreamArray', []):
  78                 stream_urls = stream.get('_stream')
  79                 if not stream_urls:
  80                     continue
  81                 if not isinstance(stream_urls, list):
  82                     stream_urls = [stream_urls]
  83                 quality = stream.get('_quality')
  84                 server = stream.get('_server')
  85                 for stream_url in stream_urls:
  86                     ext = determine_ext(stream_url)
  87                     if ext == 'f4m':
  88                         formats.extend(self._extract_f4m_formats(
  89                             stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
  90                             video_id, preference=-1, f4m_id='hds'))
  91                     elif ext == 'm3u8':
  92                         formats.extend(self._extract_m3u8_formats(
  93                             stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
  94                     else:
  95                         if server and server.startswith('rtmp'):
  96                             f = {
  97                                 'url': server,
  98                                 'play_path': stream_url,
  99                                 'format_id': 'a%s-rtmp-%s' % (num, quality),
 100                             }
 101                         elif stream_url.startswith('http'):
 102                             f = {
 103                                 'url': stream_url,
 104                                 'format_id': 'a%s-%s-%s' % (num, ext, quality)
 105                             }
 106                         else:
 107                             continue
 108                         m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
 109                         if m:
 110                             f.update({
 111                                 'width': int(m.group('width')),
 112                                 'height': int(m.group('height')),
 113                             })
 114                         if type_ == 'audio':
 115                             f['vcodec'] = 'none'
 116                         formats.append(f)
 117         return formats
 118
 119     def _real_extract(self, url):
 120         # determine video id from url
 121         m = re.match(self._VALID_URL, url)
 122
 123         numid = re.search(r'documentId=([0-9]+)', url)
 124         if numid:
 125             video_id = numid.group(1)
 126         else:
 127             video_id = m.group('video_id')
 128
 129         webpage = self._download_webpage(url, video_id)
 130
 131         if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
 132             raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
 133
 134         if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage:
 135             raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
 136
 137         if re.search(r'[\?&]rss($|[=&])', url):
 138             doc = parse_xml(webpage)
 139             if doc.tag == 'rss':
 140                 return GenericIE()._extract_rss(url, video_id, doc)
 141
 142         title = self._html_search_regex(
 143             [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
 144              r'<meta name="dcterms.title" content="(.*?)"/>',
 145              r'<h4 class="headline">(.*?)</h4>'],
 146             webpage, 'title')
 147         description = self._html_search_meta(
 148             'dcterms.abstract', webpage, 'description', default=None)
 149         if description is None:
 150             description = self._html_search_meta(
 151                 'description', webpage, 'meta description')
 152
 153         # Thumbnail is sometimes not present.
 154         # It is in the mobile version, but that seems to use a different URL
 155         # structure altogether.
 156         thumbnail = self._og_search_thumbnail(webpage, default=None)
 157
 158         media_streams = re.findall(r'''(?x)
 159             mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
 160             "([^"]+)"''', webpage)
 161
 162         if media_streams:
 163             QUALITIES = qualities(['lo', 'hi', 'hq'])
 164             formats = []
 165             for furl in set(media_streams):
 166                 if furl.endswith('.f4m'):
 167                     fid = 'f4m'
 168                 else:
 169                     fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
 170                     fid = fid_m.group(1) if fid_m else None
 171                 formats.append({
 172                     'quality': QUALITIES(fid),
 173                     'format_id': fid,
 174                     'url': furl,
 175                 })
 176             self._sort_formats(formats)
 177             info = {
 178                 'formats': formats,
 179             }
 180         else:  # request JSON file
 181             info = self._extract_media_info(
 182                 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
 183
 184         info.update({
 185             'id': video_id,
 186             'title': title,
 187             'description': description,
 188             'thumbnail': thumbnail,
 189         })
 190
 191         return info
 192
 193
 194 class ARDIE(InfoExtractor):
 195     _VALID_URL = '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
 196     _TEST = {
 197         'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
 198         'md5': 'd216c3a86493f9322545e045ddc3eb35',
 199         'info_dict': {
 200             'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
 201             'id': '100',
 202             'ext': 'mp4',
 203             'duration': 2600,
 204             'title': 'Die Story im Ersten: Mission unter falscher Flagge',
 205             'upload_date': '20140804',
 206             'thumbnail': 're:^https?://.*\.jpg$',
 207         }
 208     }
 209
 210     def _real_extract(self, url):
 211         mobj = re.match(self._VALID_URL, url)
 212         display_id = mobj.group('display_id')
 213
 214         player_url = mobj.group('mainurl') + '~playerXml.xml'
 215         doc = self._download_xml(player_url, display_id)
 216         video_node = doc.find('./video')
 217         upload_date = unified_strdate(xpath_text(
 218             video_node, './broadcastDate'))
 219         thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
 220
 221         formats = []
 222         for a in video_node.findall('.//asset'):
 223             f = {
 224                 'format_id': a.attrib['type'],
 225                 'width': int_or_none(a.find('./frameWidth').text),
 226                 'height': int_or_none(a.find('./frameHeight').text),
 227                 'vbr': int_or_none(a.find('./bitrateVideo').text),
 228                 'abr': int_or_none(a.find('./bitrateAudio').text),
 229                 'vcodec': a.find('./codecVideo').text,
 230                 'tbr': int_or_none(a.find('./totalBitrate').text),
 231             }
 232             if a.find('./serverPrefix').text:
 233                 f['url'] = a.find('./serverPrefix').text
 234                 f['playpath'] = a.find('./fileName').text
 235             else:
 236                 f['url'] = a.find('./fileName').text
 237             formats.append(f)
 238         self._sort_formats(formats)
 239
 240         return {
 241             'id': mobj.group('id'),
 242             'formats': formats,
 243             'display_id': display_id,
 244             'title': video_node.find('./title').text,
 245             'duration': parse_duration(video_node.find('./duration').text),
 246             'upload_date': upload_date,
 247             'thumbnail': thumbnail,
 248         }