youtube_dlc/extractor/arte.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import (
   8     compat_str,
   9     compat_urlparse,
  10 )
  11 from ..utils import (
  12     ExtractorError,
  13     int_or_none,
  14     qualities,
  15     try_get,
  16     unified_strdate,
  17     url_or_none,
  18 )
  19
  20
  21 class ArteTVBaseIE(InfoExtractor):
  22     _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
  23     _API_BASE = 'https://api.arte.tv/api/player/v1'
  24
  25
  26 class ArteTVIE(ArteTVBaseIE):
  27     _VALID_URL = r'''(?x)
  28                     https?://
  29                         (?:
  30                             (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
  31                             api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
  32                         )
  33                         /(?P<id>\d{6}-\d{3}-[AF])
  34                     ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
  35     _TESTS = [{
  36         'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
  37         'info_dict': {
  38             'id': '088501-000-A',
  39             'ext': 'mp4',
  40             'title': 'Mexico: Stealing Petrol to Survive',
  41             'upload_date': '20190628',
  42         },
  43     }, {
  44         'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
  45         'only_matching': True,
  46     }, {
  47         'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
  48         'only_matching': True,
  49     }]
  50
  51     def _real_extract(self, url):
  52         mobj = re.match(self._VALID_URL, url)
  53         video_id = mobj.group('id')
  54         lang = mobj.group('lang') or mobj.group('lang_2')
  55
  56         info = self._download_json(
  57             '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
  58         player_info = info['videoJsonPlayer']
  59
  60         vsr = try_get(player_info, lambda x: x['VSR'], dict)
  61         if not vsr:
  62             error = None
  63             if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error':
  64                 error = try_get(
  65                     player_info, lambda x: x['custom_msg']['msg'], compat_str)
  66             if not error:
  67                 error = 'Video %s is not available' % player_info.get('VID') or video_id
  68             raise ExtractorError(error, expected=True)
  69
  70         upload_date_str = player_info.get('shootingDate')
  71         if not upload_date_str:
  72             upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
  73
  74         title = (player_info.get('VTI') or player_info['VID']).strip()
  75         subtitle = player_info.get('VSU', '').strip()
  76         if subtitle:
  77             title += ' - %s' % subtitle
  78
  79         qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
  80
  81         LANGS = {
  82             'fr': 'F',
  83             'de': 'A',
  84             'en': 'E[ANG]',
  85             'es': 'E[ESP]',
  86             'it': 'E[ITA]',
  87             'pl': 'E[POL]',
  88         }
  89
  90         langcode = LANGS.get(lang, lang)
  91
  92         formats = []
  93         for format_id, format_dict in vsr.items():
  94             f = dict(format_dict)
  95             format_url = url_or_none(f.get('url'))
  96             streamer = f.get('streamer')
  97             if not format_url and not streamer:
  98                 continue
  99             versionCode = f.get('versionCode')
 100             l = re.escape(langcode)
 101
 102             # Language preference from most to least priority
 103             # Reference: section 6.8 of
 104             # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf
 105             PREFERENCES = (
 106                 # original version in requested language, without subtitles
 107                 r'VO{0}$'.format(l),
 108                 # original version in requested language, with partial subtitles in requested language
 109                 r'VO{0}-ST{0}$'.format(l),
 110                 # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
 111                 r'VO{0}-STM{0}$'.format(l),
 112                 # non-original (dubbed) version in requested language, without subtitles
 113                 r'V{0}$'.format(l),
 114                 # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language
 115                 r'V{0}-ST{0}$'.format(l),
 116                 # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
 117                 r'V{0}-STM{0}$'.format(l),
 118                 # original version in requested language, with partial subtitles in different language
 119                 r'VO{0}-ST(?!{0}).+?$'.format(l),
 120                 # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
 121                 r'VO{0}-STM(?!{0}).+?$'.format(l),
 122                 # original version in different language, with partial subtitles in requested language
 123                 r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l),
 124                 # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
 125                 r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l),
 126                 # original version in different language, without subtitles
 127                 r'VO(?:(?!{0}))?$'.format(l),
 128                 # original version in different language, with partial subtitles in different language
 129                 r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l),
 130                 # original version in different language, with subtitles for the deaf and hard-of-hearing in different language
 131                 r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l),
 132             )
 133
 134             for pref, p in enumerate(PREFERENCES):
 135                 if re.match(p, versionCode):
 136                     lang_pref = len(PREFERENCES) - pref
 137                     break
 138             else:
 139                 lang_pref = -1
 140
 141             media_type = f.get('mediaType')
 142             if media_type == 'hls':
 143                 m3u8_formats = self._extract_m3u8_formats(
 144                     format_url, video_id, 'mp4', entry_protocol='m3u8_native',
 145                     m3u8_id=format_id, fatal=False)
 146                 for m3u8_format in m3u8_formats:
 147                     m3u8_format['language_preference'] = lang_pref
 148                 formats.extend(m3u8_formats)
 149                 continue
 150
 151             format = {
 152                 'format_id': format_id,
 153                 'language_preference': lang_pref,
 154                 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
 155                 'width': int_or_none(f.get('width')),
 156                 'height': int_or_none(f.get('height')),
 157                 'tbr': int_or_none(f.get('bitrate')),
 158                 'quality': qfunc(f.get('quality')),
 159             }
 160
 161             if media_type == 'rtmp':
 162                 format['url'] = f['streamer']
 163                 format['play_path'] = 'mp4:' + f['url']
 164                 format['ext'] = 'flv'
 165             else:
 166                 format['url'] = f['url']
 167
 168             formats.append(format)
 169
 170         # For this extractor, quality only represents the relative quality
 171         # with respect to other formats with the same resolution
 172         self._sort_formats(formats, ('res', 'quality'))
 173
 174         return {
 175             'id': player_info.get('VID') or video_id,
 176             'title': title,
 177             'description': player_info.get('VDE'),
 178             'upload_date': unified_strdate(upload_date_str),
 179             'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
 180             'formats': formats,
 181         }
 182
 183
 184 class ArteTVEmbedIE(InfoExtractor):
 185     _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
 186     _TESTS = [{
 187         'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
 188         'info_dict': {
 189             'id': '100605-013-A',
 190             'ext': 'mp4',
 191             'title': 'United we Stream November Lockdown Edition #13',
 192             'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
 193             'upload_date': '20201116',
 194         },
 195     }, {
 196         'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
 197         'only_matching': True,
 198     }]
 199
 200     @staticmethod
 201     def _extract_urls(webpage):
 202         return [url for _, url in re.findall(
 203             r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
 204             webpage)]
 205
 206     def _real_extract(self, url):
 207         qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
 208         json_url = qs['json_url'][0]
 209         video_id = ArteTVIE._match_id(json_url)
 210         return self.url_result(
 211             json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
 212
 213
 214 class ArteTVPlaylistIE(ArteTVBaseIE):
 215     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
 216     _TESTS = [{
 217         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
 218         'info_dict': {
 219             'id': 'RC-016954',
 220             'title': 'Earn a Living',
 221             'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
 222         },
 223         'playlist_mincount': 6,
 224     }, {
 225         'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
 226         'only_matching': True,
 227     }]
 228
 229     def _real_extract(self, url):
 230         lang, playlist_id = re.match(self._VALID_URL, url).groups()
 231         collection = self._download_json(
 232             '%s/collectionData/%s/%s?source=videos'
 233             % (self._API_BASE, lang, playlist_id), playlist_id)
 234         entries = []
 235         for video in collection['videos']:
 236             if not isinstance(video, dict):
 237                 continue
 238             video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
 239             if not video_url:
 240                 continue
 241             video_id = video.get('programId')
 242             entries.append({
 243                 '_type': 'url_transparent',
 244                 'url': video_url,
 245                 'id': video_id,
 246                 'title': video.get('title'),
 247                 'alt_title': video.get('subtitle'),
 248                 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)),
 249                 'duration': int_or_none(video.get('durationSeconds')),
 250                 'view_count': int_or_none(video.get('views')),
 251                 'ie_key': ArteTVIE.ie_key(),
 252             })
 253         title = collection.get('title')
 254         description = collection.get('shortDescription') or collection.get('teaserText')
 255         return self.playlist_result(entries, playlist_id, title, description)