yt_dlp/extractor/sina.py

   1 from .common import InfoExtractor
   2 from ..networking import HEADRequest
   3 from ..utils import (
   4     ExtractorError,
   5     clean_html,
   6     get_element_by_attribute,
   7     int_or_none,
   8     qualities,
   9     update_url_query,
  10 )
  11
  12
  13 class SinaIE(InfoExtractor):
  14     _VALID_URL = r'''(?x)https?://(?:[^/?#]+\.)?video\.sina\.com\.cn/
  15                         (?:
  16                             (?:view/|.*\#)(?P<id>\d+)|
  17                             .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)|
  18                             # This is used by external sites like Weibo
  19                             api/sinawebApi/outplay.php/(?P<token>.+?)\.swf
  20                         )
  21                   '''
  22
  23     _TESTS = [
  24         {
  25             'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622',
  26             'md5': 'd38433e2fc886007729735650ae4b3e9',
  27             'info_dict': {
  28                 'id': '250576622',
  29                 'ext': 'mp4',
  30                 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名',
  31             },
  32         },
  33         {
  34             'url': 'http://video.sina.com.cn/v/b/101314253-1290078633.html',
  35             'info_dict': {
  36                 'id': '101314253',
  37                 'ext': 'flv',
  38                 'title': '军方提高对朝情报监视级别',
  39             },
  40             'skip': 'the page does not exist or has been deleted',
  41         },
  42         {
  43             'url': 'http://video.sina.com.cn/view/250587748.html',
  44             'md5': '3d1807a25c775092aab3bc157fff49b4',
  45             'info_dict': {
  46                 'id': '250587748',
  47                 'ext': 'mp4',
  48                 'title': '瞬间泪目：8年前汶川地震珍贵视频首曝光',
  49             },
  50         },
  51     ]
  52
  53     def _real_extract(self, url):
  54         mobj = self._match_valid_url(url)
  55
  56         video_id = mobj.group('id')
  57         if not video_id:
  58             if mobj.group('token') is not None:
  59                 # The video id is in the redirected url
  60                 self.to_screen('Getting video id')
  61                 request = HEADRequest(url)
  62                 _, urlh = self._download_webpage_handle(request, 'NA', False)
  63                 return self._real_extract(urlh.url)
  64             else:
  65                 pseudo_id = mobj.group('pseudo_id')
  66                 webpage = self._download_webpage(url, pseudo_id)
  67                 error = get_element_by_attribute('class', 'errtitle', webpage)
  68                 if error:
  69                     raise ExtractorError(f'{self.IE_NAME} said: {clean_html(error)}', expected=True)
  70                 video_id = self._search_regex(
  71                     r"video_id\s*:\s*'(\d+)'", webpage, 'video id')
  72
  73         video_data = self._download_json(
  74             'http://s.video.sina.com.cn/video/h5play',
  75             video_id, query={'video_id': video_id})
  76         if video_data['code'] != 1:
  77             raise ExtractorError('{} said: {}'.format(
  78                 self.IE_NAME, video_data['message']), expected=True)
  79         else:
  80             video_data = video_data['data']
  81             title = video_data['title']
  82             description = video_data.get('description')
  83             if description:
  84                 description = description.strip()
  85
  86             preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd'])
  87             formats = []
  88             for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items():
  89                 file_api = quality.get('file_api')
  90                 file_id = quality.get('file_id')
  91                 if not file_api or not file_id:
  92                     continue
  93                 formats.append({
  94                     'format_id': quality_id,
  95                     'url': update_url_query(file_api, {'vid': file_id}),
  96                     'quality': preference(quality_id),
  97                     'ext': 'mp4',
  98                 })
  99
 100             return {
 101                 'id': video_id,
 102                 'title': title,
 103                 'description': description,
 104                 'thumbnail': video_data.get('image'),
 105                 'duration': int_or_none(video_data.get('length')),
 106                 'timestamp': int_or_none(video_data.get('create_time')),
 107                 'formats': formats,
 108             }