yt_dlp/extractor/tagesschau.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..utils import (
   8     js_to_json,
   9     extract_attributes,
  10     try_get,
  11     int_or_none,
  12 )
  13
  14
  15 class TagesschauIE(InfoExtractor):
  16     _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
  17
  18     _TESTS = [{
  19         'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
  20         'md5': '7a7287612fa881a1ae1d087df45c2fd6',
  21         'info_dict': {
  22             'id': 'video-102143-1',
  23             'ext': 'mp4',
  24             'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
  25         },
  26     }, {
  27         'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
  28         'md5': '3c54c1f6243d279b706bde660ceec633',
  29         'info_dict': {
  30             'id': 'ts-5727-1',
  31             'ext': 'mp4',
  32             'title': 'Ganze Sendung',
  33         },
  34     }, {
  35         # exclusive audio
  36         'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
  37         'md5': '4cf22023c285f35e99c24d290ba58cc9',
  38         'info_dict': {
  39             'id': 'audio-29417-1',
  40             'ext': 'mp3',
  41             'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
  42         },
  43     }, {
  44         'url': 'http://www.tagesschau.de/inland/bnd-303.html',
  45         'md5': '12cfb212d9325b5ba0d52b625f1aa61c',
  46         'info_dict': {
  47             'id': 'bnd-303-1',
  48             'ext': 'mp4',
  49             'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa',
  50         },
  51     }, {
  52         'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
  53         'info_dict': {
  54             'id': 'afd-parteitag-135',
  55             'title': 'AfD',
  56         },
  57         'playlist_count': 20,
  58     }, {
  59         'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
  60         'info_dict': {
  61             'id': 'audio-29417-1',
  62             'ext': 'mp3',
  63             'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
  64         },
  65     }, {
  66         'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
  67         'only_matching': True,
  68     }, {
  69         'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
  70         'only_matching': True,
  71     }, {
  72         'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
  73         'only_matching': True,
  74     }, {
  75         'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
  76         'only_matching': True,
  77     }, {
  78         'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
  79         'only_matching': True,
  80     }, {
  81         'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
  82         'only_matching': True,
  83     }, {
  84         'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
  85         'only_matching': True,
  86     }, {
  87         'url': 'http://www.tagesschau.de/100sekunden/index.html',
  88         'only_matching': True,
  89     }, {
  90         # playlist article with collapsing sections
  91         'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
  92         'only_matching': True,
  93     }]
  94
  95     def _real_extract(self, url):
  96         mobj = self._match_valid_url(url)
  97         video_id = mobj.group('id') or mobj.group('path')
  98         display_id = video_id.lstrip('-')
  99
 100         webpage = self._download_webpage(url, display_id)
 101
 102         title = self._html_search_regex(
 103             r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
 104             webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)
 105
 106         entries = []
 107         videos = re.findall(r'<div[^>]+>', webpage)
 108         num = 0
 109         for video in videos:
 110             video = extract_attributes(video).get('data-config')
 111             if not video:
 112                 continue
 113             video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False)
 114             video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
 115             if not video_formats:
 116                 continue
 117             num += 1
 118             for video_format in video_formats:
 119                 media_url = video_format.get('_stream') or ''
 120                 formats = []
 121                 if media_url.endswith('master.m3u8'):
 122                     formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls')
 123                 elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'):
 124                     formats = [{
 125                         'url': media_url,
 126                         'vcodec': 'none',
 127                     }]
 128                 if not formats:
 129                     continue
 130                 entries.append({
 131                     'id': '%s-%d' % (display_id, num),
 132                     'title': try_get(video, lambda x: x['mc']['_title']),
 133                     'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
 134                     'formats': formats
 135                 })
 136         if len(entries) > 1:
 137             return self.playlist_result(entries, display_id, title)
 138         formats = entries[0]['formats']
 139         video_info = self._search_json_ld(webpage, video_id)
 140         description = video_info.get('description')
 141         thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail')
 142         timestamp = video_info.get('timestamp')
 143         title = title or video_info.get('description')
 144
 145         self._sort_formats(formats)
 146
 147         return {
 148             'id': display_id,
 149             'title': title,
 150             'thumbnail': thumbnail,
 151             'formats': formats,
 152             'timestamp': timestamp,
 153             'description': description,
 154         }