youtube_dl/extractor/ard.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from .generic import GenericIE
   8 from ..utils import (
   9     determine_ext,
  10     ExtractorError,
  11     get_element_by_attribute,
  12     qualities,
  13     int_or_none,
  14     parse_duration,
  15     unified_strdate,
  16     xpath_text,
  17     parse_xml,
  18 )
  19
  20
  21 class ARDMediathekIE(InfoExtractor):
  22     IE_NAME = 'ARD:mediathek'
  23     _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
  24
  25     _TESTS = [{
  26         'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
  27         'only_matching': True,
  28     }, {
  29         'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
  30         'info_dict': {
  31             'id': '22490580',
  32             'ext': 'mp4',
  33             'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
  34             'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
  35         },
  36         'skip': 'Blocked outside of Germany',
  37     }, {
  38         # audio
  39         'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
  40         'md5': '219d94d8980b4f538c7fcb0865eb7f2c',
  41         'info_dict': {
  42             'id': '28488308',
  43             'ext': 'mp3',
  44             'title': 'Tod eines Fußballers',
  45             'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',
  46             'duration': 3240,
  47         },
  48     }]
  49
  50     def _extract_media_info(self, media_info_url, webpage, video_id):
  51         media_info = self._download_json(
  52             media_info_url, video_id, 'Downloading media JSON')
  53
  54         formats = self._extract_formats(media_info, video_id)
  55
  56         if not formats:
  57             if '"fsk"' in webpage:
  58                 raise ExtractorError(
  59                     'This video is only available after 20:00', expected=True)
  60             elif media_info.get('_geoblocked'):
  61                 raise ExtractorError('This video is not available due to geo restriction', expected=True)
  62
  63         self._sort_formats(formats)
  64
  65         duration = int_or_none(media_info.get('_duration'))
  66         thumbnail = media_info.get('_previewImage')
  67
  68         subtitles = {}
  69         subtitle_url = media_info.get('_subtitleUrl')
  70         if subtitle_url:
  71             subtitles['de'] = [{
  72                 'ext': 'srt',
  73                 'url': subtitle_url,
  74             }]
  75
  76         return {
  77             'id': video_id,
  78             'duration': duration,
  79             'thumbnail': thumbnail,
  80             'formats': formats,
  81             'subtitles': subtitles,
  82         }
  83
  84     def _extract_formats(self, media_info, video_id):
  85         type_ = media_info.get('_type')
  86         media_array = media_info.get('_mediaArray', [])
  87         formats = []
  88         for num, media in enumerate(media_array):
  89             for stream in media.get('_mediaStreamArray', []):
  90                 stream_urls = stream.get('_stream')
  91                 if not stream_urls:
  92                     continue
  93                 if not isinstance(stream_urls, list):
  94                     stream_urls = [stream_urls]
  95                 quality = stream.get('_quality')
  96                 server = stream.get('_server')
  97                 for stream_url in stream_urls:
  98                     ext = determine_ext(stream_url)
  99                     if ext == 'f4m':
 100                         formats.extend(self._extract_f4m_formats(
 101                             stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
 102                             video_id, preference=-1, f4m_id='hds'))
 103                     elif ext == 'm3u8':
 104                         formats.extend(self._extract_m3u8_formats(
 105                             stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
 106                     else:
 107                         if server and server.startswith('rtmp'):
 108                             f = {
 109                                 'url': server,
 110                                 'play_path': stream_url,
 111                                 'format_id': 'a%s-rtmp-%s' % (num, quality),
 112                             }
 113                         elif stream_url.startswith('http'):
 114                             f = {
 115                                 'url': stream_url,
 116                                 'format_id': 'a%s-%s-%s' % (num, ext, quality)
 117                             }
 118                         else:
 119                             continue
 120                         m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
 121                         if m:
 122                             f.update({
 123                                 'width': int(m.group('width')),
 124                                 'height': int(m.group('height')),
 125                             })
 126                         if type_ == 'audio':
 127                             f['vcodec'] = 'none'
 128                         formats.append(f)
 129         return formats
 130
 131     def _real_extract(self, url):
 132         # determine video id from url
 133         m = re.match(self._VALID_URL, url)
 134
 135         numid = re.search(r'documentId=([0-9]+)', url)
 136         if numid:
 137             video_id = numid.group(1)
 138         else:
 139             video_id = m.group('video_id')
 140
 141         webpage = self._download_webpage(url, video_id)
 142
 143         if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
 144             raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
 145
 146         if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage:
 147             raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
 148
 149         if re.search(r'[\?&]rss($|[=&])', url):
 150             doc = parse_xml(webpage)
 151             if doc.tag == 'rss':
 152                 return GenericIE()._extract_rss(url, video_id, doc)
 153
 154         title = self._html_search_regex(
 155             [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
 156              r'<meta name="dcterms.title" content="(.*?)"/>',
 157              r'<h4 class="headline">(.*?)</h4>'],
 158             webpage, 'title')
 159         description = self._html_search_meta(
 160             'dcterms.abstract', webpage, 'description', default=None)
 161         if description is None:
 162             description = self._html_search_meta(
 163                 'description', webpage, 'meta description')
 164
 165         # Thumbnail is sometimes not present.
 166         # It is in the mobile version, but that seems to use a different URL
 167         # structure altogether.
 168         thumbnail = self._og_search_thumbnail(webpage, default=None)
 169
 170         media_streams = re.findall(r'''(?x)
 171             mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
 172             "([^"]+)"''', webpage)
 173
 174         if media_streams:
 175             QUALITIES = qualities(['lo', 'hi', 'hq'])
 176             formats = []
 177             for furl in set(media_streams):
 178                 if furl.endswith('.f4m'):
 179                     fid = 'f4m'
 180                 else:
 181                     fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
 182                     fid = fid_m.group(1) if fid_m else None
 183                 formats.append({
 184                     'quality': QUALITIES(fid),
 185                     'format_id': fid,
 186                     'url': furl,
 187                 })
 188             self._sort_formats(formats)
 189             info = {
 190                 'formats': formats,
 191             }
 192         else:  # request JSON file
 193             info = self._extract_media_info(
 194                 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
 195
 196         info.update({
 197             'id': video_id,
 198             'title': title,
 199             'description': description,
 200             'thumbnail': thumbnail,
 201         })
 202
 203         return info
 204
 205
 206 class ARDIE(InfoExtractor):
 207     _VALID_URL = '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
 208     _TEST = {
 209         'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
 210         'md5': 'd216c3a86493f9322545e045ddc3eb35',
 211         'info_dict': {
 212             'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
 213             'id': '100',
 214             'ext': 'mp4',
 215             'duration': 2600,
 216             'title': 'Die Story im Ersten: Mission unter falscher Flagge',
 217             'upload_date': '20140804',
 218             'thumbnail': 're:^https?://.*\.jpg$',
 219         }
 220     }
 221
 222     def _real_extract(self, url):
 223         mobj = re.match(self._VALID_URL, url)
 224         display_id = mobj.group('display_id')
 225
 226         player_url = mobj.group('mainurl') + '~playerXml.xml'
 227         doc = self._download_xml(player_url, display_id)
 228         video_node = doc.find('./video')
 229         upload_date = unified_strdate(xpath_text(
 230             video_node, './broadcastDate'))
 231         thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
 232
 233         formats = []
 234         for a in video_node.findall('.//asset'):
 235             f = {
 236                 'format_id': a.attrib['type'],
 237                 'width': int_or_none(a.find('./frameWidth').text),
 238                 'height': int_or_none(a.find('./frameHeight').text),
 239                 'vbr': int_or_none(a.find('./bitrateVideo').text),
 240                 'abr': int_or_none(a.find('./bitrateAudio').text),
 241                 'vcodec': a.find('./codecVideo').text,
 242                 'tbr': int_or_none(a.find('./totalBitrate').text),
 243             }
 244             if a.find('./serverPrefix').text:
 245                 f['url'] = a.find('./serverPrefix').text
 246                 f['playpath'] = a.find('./fileName').text
 247             else:
 248                 f['url'] = a.find('./fileName').text
 249             formats.append(f)
 250         self._sort_formats(formats)
 251
 252         return {
 253             'id': mobj.group('id'),
 254             'formats': formats,
 255             'display_id': display_id,
 256             'title': video_node.find('./title').text,
 257             'duration': parse_duration(video_node.find('./duration').text),
 258             'upload_date': upload_date,
 259             'thumbnail': thumbnail,
 260         }
 261
 262
 263 class SportschauIE(ARDMediathekIE):
 264     IE_NAME = 'Sportschau'
 265     _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
 266     _TESTS = [{
 267         'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
 268         'info_dict': {
 269             'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
 270             'ext': 'mp4',
 271             'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
 272             'thumbnail': 're:^https?://.*\.jpg$',
 273             'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
 274         },
 275         'params': {
 276             # m3u8 download
 277             'skip_download': True,
 278         },
 279     }]
 280
 281     def _real_extract(self, url):
 282         mobj = re.match(self._VALID_URL, url)
 283         video_id = mobj.group('id')
 284         base_url = mobj.group('baseurl')
 285
 286         webpage = self._download_webpage(url, video_id)
 287         title = get_element_by_attribute('class', 'headline', webpage)
 288         description = self._html_search_meta('description', webpage, 'description')
 289
 290         info = self._extract_media_info(
 291             base_url + '-mc_defaultQuality-h.json', webpage, video_id)
 292
 293         info.update({
 294             'title': title,
 295             'description': description,
 296         })
 297
 298         return info