youtube_dlc/extractor/malltv.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     clean_html,
   7     dict_get,
   8     float_or_none,
   9     int_or_none,
  10     merge_dicts,
  11     parse_duration,
  12     try_get,
  13 )
  14
  15
  16 class MallTVIE(InfoExtractor):
  17     _VALID_URL = r'https?://(?:(?:www|sk)\.)?mall\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  18     _TESTS = [{
  19         'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
  20         'md5': '1c4a37f080e1f3023103a7b43458e518',
  21         'info_dict': {
  22             'id': 't0zzt0',
  23             'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
  24             'ext': 'mp4',
  25             'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?',
  26             'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35',
  27             'duration': 216,
  28             'timestamp': 1538870400,
  29             'upload_date': '20181007',
  30             'view_count': int,
  31         }
  32     }, {
  33         'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
  34         'only_matching': True,
  35     }, {
  36         'url': 'https://sk.mall.tv/gejmhaus/reklamacia-nehreje-vyrobnik-tepla-alebo-spekacka',
  37         'only_matching': True,
  38     }]
  39
  40     def _real_extract(self, url):
  41         display_id = self._match_id(url)
  42
  43         webpage = self._download_webpage(
  44             url, display_id, headers=self.geo_verification_headers())
  45
  46         video = self._parse_json(self._search_regex(
  47             r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);',
  48             webpage, 'video object'), display_id)
  49         video_source = video['VideoSource']
  50         video_id = self._search_regex(
  51             r'/([\da-z]+)/index\b', video_source, 'video id')
  52
  53         formats = self._extract_m3u8_formats(
  54             video_source + '.m3u8', video_id, 'mp4', 'm3u8_native')
  55         self._sort_formats(formats)
  56
  57         subtitles = {}
  58         for s in (video.get('Subtitles') or {}):
  59             s_url = s.get('Url')
  60             if not s_url:
  61                 continue
  62             subtitles.setdefault(s.get('Language') or 'cz', []).append({
  63                 'url': s_url,
  64             })
  65
  66         entity_counts = video.get('EntityCounts') or {}
  67
  68         def get_count(k):
  69             v = entity_counts.get(k + 's') or {}
  70             return int_or_none(dict_get(v, ('Count', 'StrCount')))
  71
  72         info = self._search_json_ld(webpage, video_id, default={})
  73
  74         return merge_dicts({
  75             'id': video_id,
  76             'display_id': display_id,
  77             'title': video.get('Title'),
  78             'description': clean_html(video.get('Description')),
  79             'thumbnail': video.get('ThumbnailUrl'),
  80             'formats': formats,
  81             'subtitles': subtitles,
  82             'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')),
  83             'view_count': get_count('View'),
  84             'like_count': get_count('Like'),
  85             'dislike_count': get_count('Dislike'),
  86             'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])),
  87             'comment_count': get_count('Comment'),
  88         }, info)