yt_dlp/extractor/kusi.py

   1 import random
   2 import urllib.parse
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     float_or_none,
   7     int_or_none,
   8     timeconvert,
   9     update_url_query,
  10     xpath_text,
  11 )
  12
  13
  14 class KUSIIE(InfoExtractor):
  15     _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))'
  16     _TESTS = [{
  17         'url': 'http://www.kusi.com/story/32849881/turko-files-refused-to-help-it-aint-right',
  18         'md5': '4e76ce8e53660ce9697d06c0ba6fc47d',
  19         'info_dict': {
  20             'id': '12689020',
  21             'ext': 'mp4',
  22             'title': "Turko Files: Refused to Help, It Ain't Right!",
  23             'duration': 223.586,
  24             'upload_date': '20160826',
  25             'timestamp': 1472233118,
  26             'thumbnail': r're:^https?://.*\.jpg$'
  27         },
  28     }, {
  29         'url': 'http://kusi.com/video?clipId=12203019',
  30         'only_matching': True,
  31     }]
  32
  33     def _real_extract(self, url):
  34         mobj = self._match_valid_url(url)
  35         clip_id = mobj.group('clipId')
  36         video_id = clip_id or mobj.group('path')
  37
  38         webpage = self._download_webpage(url, video_id)
  39
  40         if clip_id is None:
  41             video_id = clip_id = self._html_search_regex(
  42                 r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id')
  43
  44         affiliate_id = self._search_regex(
  45             r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id')
  46
  47         # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf
  48         xml_url = update_url_query('http://www.kusi.com/build.asp', {
  49             'buildtype': 'buildfeaturexmlrequest',
  50             'featureType': 'Clip',
  51             'featureid': clip_id,
  52             'affiliateno': affiliate_id,
  53             'clientgroupid': '1',
  54             'rnd': int(round(random.random() * 1000000)),
  55         })
  56
  57         doc = self._download_xml(xml_url, video_id)
  58
  59         video_title = xpath_text(doc, 'HEADLINE', fatal=True)
  60         duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000)
  61         description = xpath_text(doc, 'ABSTRACT')
  62         thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME')
  63         creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate'))
  64
  65         quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content')
  66         formats = []
  67         for quality in quality_options:
  68             formats.append({
  69                 'url': urllib.parse.unquote_plus(quality.attrib['url']),
  70                 'height': int_or_none(quality.attrib.get('height')),
  71                 'width': int_or_none(quality.attrib.get('width')),
  72                 'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000),
  73             })
  74
  75         return {
  76             'id': video_id,
  77             'title': video_title,
  78             'description': description,
  79             'duration': duration,
  80             'formats': formats,
  81             'thumbnail': thumbnail,
  82             'timestamp': creation_time,
  83         }