yt_dlp/extractor/ustudio.py

   1 from __future__ import unicode_literals
   2
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     int_or_none,
   7     unified_strdate,
   8     unescapeHTML,
   9 )
  10
  11
  12 class UstudioIE(InfoExtractor):
  13     IE_NAME = 'ustudio'
  14     _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'
  15     _TEST = {
  16         'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge',
  17         'md5': '58bbfca62125378742df01fc2abbdef6',
  18         'info_dict': {
  19             'id': 'Uxu2my9bgSph',
  20             'display_id': 'san_francisco_golden_gate_bridge',
  21             'ext': 'mp4',
  22             'title': 'San Francisco: Golden Gate Bridge',
  23             'description': 'md5:23925500697f2c6d4830e387ba51a9be',
  24             'thumbnail': r're:^https?://.*\.jpg$',
  25             'upload_date': '20111107',
  26             'uploader': 'Tony Farley',
  27         }
  28     }
  29
  30     def _real_extract(self, url):
  31         video_id, display_id = self._match_valid_url(url).groups()
  32
  33         config = self._download_xml(
  34             'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id,
  35             display_id)
  36
  37         def extract(kind):
  38             return [{
  39                 'url': unescapeHTML(item.attrib['url']),
  40                 'width': int_or_none(item.get('width')),
  41                 'height': int_or_none(item.get('height')),
  42             } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')]
  43
  44         formats = extract('video')
  45         self._sort_formats(formats)
  46
  47         webpage = self._download_webpage(url, display_id)
  48
  49         title = self._og_search_title(webpage)
  50         upload_date = unified_strdate(self._search_regex(
  51             r'(?s)Uploaded by\s*.+?\s*on\s*<span>([^<]+)</span>',
  52             webpage, 'upload date', fatal=False))
  53         uploader = self._search_regex(
  54             r'Uploaded by\s*<a[^>]*>([^<]+)<',
  55             webpage, 'uploader', fatal=False)
  56
  57         return {
  58             'id': video_id,
  59             'display_id': display_id,
  60             'title': title,
  61             'description': self._og_search_description(webpage),
  62             'thumbnails': extract('image'),
  63             'upload_date': upload_date,
  64             'uploader': uploader,
  65             'formats': formats,
  66         }
  67
  68
  69 class UstudioEmbedIE(InfoExtractor):
  70     IE_NAME = 'ustudio:embed'
  71     _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)'
  72     _TEST = {
  73         'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T',
  74         'md5': '47c0be52a09b23a7f40de9469cec58f4',
  75         'info_dict': {
  76             'id': 'Uw7G1kMCe65T',
  77             'ext': 'mp4',
  78             'title': '5 Things IT Should Know About Video',
  79             'description': 'md5:93d32650884b500115e158c5677d25ad',
  80             'uploader_id': 'DeN7VdYRDKhP',
  81         }
  82     }
  83
  84     def _real_extract(self, url):
  85         uploader_id, video_id = self._match_valid_url(url).groups()
  86         video_data = self._download_json(
  87             'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id),
  88             video_id)['videos'][0]
  89         title = video_data['name']
  90
  91         formats = []
  92         for ext, qualities in video_data.get('transcodes', {}).items():
  93             for quality in qualities:
  94                 quality_url = quality.get('url')
  95                 if not quality_url:
  96                     continue
  97                 height = int_or_none(quality.get('height'))
  98                 formats.append({
  99                     'format_id': '%s-%dp' % (ext, height) if height else ext,
 100                     'url': quality_url,
 101                     'width': int_or_none(quality.get('width')),
 102                     'height': height,
 103                 })
 104         self._sort_formats(formats)
 105
 106         thumbnails = []
 107         for image in video_data.get('images', []):
 108             image_url = image.get('url')
 109             if not image_url:
 110                 continue
 111             thumbnails.append({
 112                 'url': image_url,
 113             })
 114
 115         return {
 116             'id': video_id,
 117             'title': title,
 118             'description': video_data.get('description'),
 119             'duration': int_or_none(video_data.get('duration')),
 120             'uploader_id': uploader_id,
 121             'tags': video_data.get('keywords'),
 122             'thumbnails': thumbnails,
 123             'formats': formats,
 124         }