youtube_dl/extractor/trilulilu.py

   1 import json
   2 import re
   3 import xml.etree.ElementTree
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8 )
   9
  10
  11 class TriluliluIE(InfoExtractor):
  12     _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?trilulilu\.ro/video-(?P<category>[^/]+)/(?P<video_id>[^/]+)'
  13     _TEST = {
  14         u"url": u"http://www.trilulilu.ro/video-animatie/big-buck-bunny-1",
  15         u'file': u"big-buck-bunny-1.mp4",
  16         u'info_dict': {
  17             u"title": u"Big Buck Bunny",
  18             u"description": u":) pentru copilul din noi",
  19         },
  20         # Server ignores Range headers (--test)
  21         u"params": {
  22             u"skip_download": True
  23         }
  24     }
  25
  26     def _real_extract(self, url):
  27         mobj = re.match(self._VALID_URL, url)
  28         video_id = mobj.group('video_id')
  29
  30         webpage = self._download_webpage(url, video_id)
  31
  32         title = self._og_search_title(webpage)
  33         thumbnail = self._og_search_thumbnail(webpage)
  34         description = self._og_search_description(webpage)
  35
  36         log_str = self._search_regex(
  37             r'block_flash_vars[ ]=[ ]({[^}]+})', webpage, u'log info')
  38         log = json.loads(log_str)
  39
  40         format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/'
  41                       u'video-formats2' % log)
  42         format_str = self._download_webpage(
  43             format_url, video_id,
  44             note=u'Downloading formats',
  45             errnote=u'Error while downloading formats')
  46
  47         format_doc = xml.etree.ElementTree.fromstring(format_str)
  48
  49         video_url_template = (
  50             u'http://fs%(server)s.trilulilu.ro/stream.php?type=video'
  51             u'&source=site&hash=%(hash)s&username=%(userid)s&'
  52             u'key=ministhebest&format=%%s&sig=&exp=' %
  53             log)
  54         formats = [
  55             {
  56                 'format': fnode.text,
  57                 'url': video_url_template % fnode.text,
  58             }
  59
  60             for fnode in format_doc.findall('./formats/format')
  61         ]
  62
  63         info = {
  64             '_type': 'video',
  65             'id': video_id,
  66             'formats': formats,
  67             'title': title,
  68             'description': description,
  69             'thumbnail': thumbnail,
  70         }
  71
  72         # TODO: Remove when #980 has been merged
  73         info['url'] = formats[-1]['url']
  74         info['ext'] = formats[-1]['format'].partition('-')[0]
  75
  76         return info