yt_dlp/extractor/sina.py

   1 from .common import InfoExtractor
   2 from ..utils import (
   3     HEADRequest,
   4     ExtractorError,
   5     int_or_none,
   6     update_url_query,
   7     qualities,
   8     get_element_by_attribute,
   9     clean_html,
  10 )
  11
  12
  13 class SinaIE(InfoExtractor):
  14     _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/
  15                         (?:
  16                             (?:view/|.*\#)(?P<id>\d+)|
  17                             .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)|
  18                             # This is used by external sites like Weibo
  19                             api/sinawebApi/outplay.php/(?P<token>.+?)\.swf
  20                         )
  21                   '''
  22
  23     _TESTS = [
  24         {
  25             'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622',
  26             'md5': 'd38433e2fc886007729735650ae4b3e9',
  27             'info_dict': {
  28                 'id': '250576622',
  29                 'ext': 'mp4',
  30                 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名',
  31             }
  32         },
  33         {
  34             'url': 'http://video.sina.com.cn/v/b/101314253-1290078633.html',
  35             'info_dict': {
  36                 'id': '101314253',
  37                 'ext': 'flv',
  38                 'title': '军方提高对朝情报监视级别',
  39             },
  40             'skip': 'the page does not exist or has been deleted',
  41         },
  42         {
  43             'url': 'http://video.sina.com.cn/view/250587748.html',
  44             'md5': '3d1807a25c775092aab3bc157fff49b4',
  45             'info_dict': {
  46                 'id': '250587748',
  47                 'ext': 'mp4',
  48                 'title': '瞬间泪目：8年前汶川地震珍贵视频首曝光',
  49             },
  50         },
  51     ]
  52
  53     def _real_extract(self, url):
  54         mobj = self._match_valid_url(url)
  55
  56         video_id = mobj.group('id')
  57         if not video_id:
  58             if mobj.group('token') is not None:
  59                 # The video id is in the redirected url
  60                 self.to_screen('Getting video id')
  61                 request = HEADRequest(url)
  62                 _, urlh = self._download_webpage_handle(request, 'NA', False)
  63                 return self._real_extract(urlh.geturl())
  64             else:
  65                 pseudo_id = mobj.group('pseudo_id')
  66                 webpage = self._download_webpage(url, pseudo_id)
  67                 error = get_element_by_attribute('class', 'errtitle', webpage)
  68                 if error:
  69                     raise ExtractorError('%s said: %s' % (
  70                         self.IE_NAME, clean_html(error)), expected=True)
  71                 video_id = self._search_regex(
  72                     r"video_id\s*:\s*'(\d+)'", webpage, 'video id')
  73
  74         video_data = self._download_json(
  75             'http://s.video.sina.com.cn/video/h5play',
  76             video_id, query={'video_id': video_id})
  77         if video_data['code'] != 1:
  78             raise ExtractorError('%s said: %s' % (
  79                 self.IE_NAME, video_data['message']), expected=True)
  80         else:
  81             video_data = video_data['data']
  82             title = video_data['title']
  83             description = video_data.get('description')
  84             if description:
  85                 description = description.strip()
  86
  87             preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd'])
  88             formats = []
  89             for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items():
  90                 file_api = quality.get('file_api')
  91                 file_id = quality.get('file_id')
  92                 if not file_api or not file_id:
  93                     continue
  94                 formats.append({
  95                     'format_id': quality_id,
  96                     'url': update_url_query(file_api, {'vid': file_id}),
  97                     'quality': preference(quality_id),
  98                     'ext': 'mp4',
  99                 })
 100
 101             return {
 102                 'id': video_id,
 103                 'title': title,
 104                 'description': description,
 105                 'thumbnail': video_data.get('image'),
 106                 'duration': int_or_none(video_data.get('length')),
 107                 'timestamp': int_or_none(video_data.get('create_time')),
 108                 'formats': formats,
 109             }