yt_dlp/extractor/golem.py

   1 import urllib.parse
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     determine_ext,
   6 )
   7
   8
   9 class GolemIE(InfoExtractor):
  10     _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/'
  11     _TEST = {
  12         'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html',
  13         'md5': 'c1a2c0a3c863319651c7c992c5ee29bf',
  14         'info_dict': {
  15             'id': '14095',
  16             'format_id': 'high',
  17             'ext': 'mp4',
  18             'title': 'iPhone 6 und 6 Plus - Test',
  19             'duration': 300.44,
  20             'filesize': 65309548,
  21         },
  22     }
  23
  24     _PREFIX = 'http://video.golem.de'
  25
  26     def _real_extract(self, url):
  27         video_id = self._match_id(url)
  28
  29         config = self._download_xml(
  30             f'https://video.golem.de/xml/{video_id}.xml', video_id)
  31
  32         info = {
  33             'id': video_id,
  34             'title': config.findtext('./title', 'golem'),
  35             'duration': self._float(config.findtext('./playtime'), 'duration'),
  36         }
  37
  38         formats = []
  39         for e in config:
  40             url = e.findtext('./url')
  41             if not url:
  42                 continue
  43
  44             formats.append({
  45                 'format_id': str(e.tag),
  46                 'url': urllib.parse.urljoin(self._PREFIX, url),
  47                 'height': self._int(e.get('height'), 'height'),
  48                 'width': self._int(e.get('width'), 'width'),
  49                 'filesize': self._int(e.findtext('filesize'), 'filesize'),
  50                 'ext': determine_ext(e.findtext('./filename')),
  51             })
  52         info['formats'] = formats
  53
  54         thumbnails = []
  55         for e in config.findall('.//teaser'):
  56             url = e.findtext('./url')
  57             if not url:
  58                 continue
  59             thumbnails.append({
  60                 'url': urllib.parse.urljoin(self._PREFIX, url),
  61                 'width': self._int(e.get('width'), 'thumbnail width'),
  62                 'height': self._int(e.get('height'), 'thumbnail height'),
  63             })
  64         info['thumbnails'] = thumbnails
  65
  66         return info