youtube_dl/extractor/viidea.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..compat import (
   7     compat_HTTPError,
   8     compat_urlparse,
   9 )
  10 from ..utils import (
  11     ExtractorError,
  12     parse_duration,
  13     js_to_json,
  14     parse_iso8601,
  15 )
  16
  17
  18 class ViideaIE(InfoExtractor):
  19     _VALID_URL = r'''(?x)http://(?:www\.)?(?:
  20             videolectures\.net|
  21             flexilearn\.viidea\.net|
  22             presentations\.ocwconsortium\.org|
  23             video\.travel-zoom\.si|
  24             video\.pomp-forum\.si|
  25             tv\.nil\.si|
  26             video\.hekovnik.com|
  27             video\.szko\.si|
  28             kpk\.viidea\.com|
  29             inside\.viidea\.net|
  30             video\.kiberpipa\.org|
  31             bvvideo\.si|
  32             kongres\.viidea\.net|
  33             edemokracija\.viidea\.com
  34         )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?'''
  35
  36     _TESTS = [{
  37         'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
  38         'info_dict': {
  39             'id': '20171_part1',
  40             'ext': 'mp4',
  41             'title': 'Automatics, robotics and biocybernetics',
  42             'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
  43             'upload_date': '20130627',
  44             'duration': 565,
  45             'thumbnail': 're:http://.*\.jpg',
  46         },
  47     }, {
  48         # video with invalid direct format links (HTTP 403)
  49         'url': 'http://videolectures.net/russir2010_filippova_nlp/',
  50         'info_dict': {
  51             'id': '14891_part1',
  52             'ext': 'flv',
  53             'title': 'NLP at Google',
  54             'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3',
  55             'duration': 5352,
  56             'thumbnail': 're:http://.*\.jpg',
  57         },
  58         'params': {
  59             # rtmp download
  60             'skip_download': True,
  61         },
  62     }, {
  63         'url': 'http://videolectures.net/deeplearning2015_montreal/',
  64         'info_dict': {
  65             'id': '23181',
  66             'title': 'Deep Learning Summer School, Montreal 2015',
  67             'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7',
  68             'timestamp': 1438560000,
  69         },
  70         'playlist_count': 30,
  71     }, {
  72         # multi part lecture
  73         'url': 'http://videolectures.net/mlss09uk_bishop_ibi/',
  74         'info_dict': {
  75             'id': '9737',
  76             'title': 'Introduction To Bayesian Inference',
  77             'timestamp': 1251622800,
  78         },
  79         'playlist': [{
  80             'info_dict': {
  81                 'id': '9737_part1',
  82                 'ext': 'wmv',
  83                 'title': 'Introduction To Bayesian Inference',
  84             },
  85         }, {
  86             'info_dict': {
  87                 'id': '9737_part2',
  88                 'ext': 'wmv',
  89                 'title': 'Introduction To Bayesian Inference',
  90             },
  91         }],
  92         'playlist_count': 2,
  93     }]
  94
  95     def _real_extract(self, url):
  96         lecture_slug, part = re.match(self._VALID_URL, url).groups()
  97
  98         webpage = self._download_webpage(url, lecture_slug)
  99
 100         cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json)
 101
 102         lecture_id = str(cfg['obj_id'])
 103
 104         base_url = self._proto_relative_url(cfg['livepipe'], 'http:')
 105
 106         lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_id)['lecture'][0]
 107
 108         lecture_info = {
 109             'id': lecture_id,
 110             'display_id': lecture_slug,
 111             'title': lecture_data['title'],
 112             'timestamp': parse_iso8601(lecture_data.get('time')),
 113             'description': lecture_data.get('description_wiki'),
 114             'thumbnail': lecture_data.get('thumb'),
 115         }
 116
 117         entries = []
 118         parts = cfg.get('videos')
 119         if parts:
 120             if len(parts) == 1:
 121                 part = str(parts[0])
 122             if part:
 123                 smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part)
 124                 smil = self._download_smil(smil_url, lecture_id)
 125                 info = self._parse_smil(smil, smil_url, lecture_id)
 126                 info['id'] = '%s_part%s' % (lecture_id, part)
 127                 switch = smil.find('.//switch')
 128                 if switch is not None:
 129                     info['duration'] = parse_duration(switch.attrib.get('dur'))
 130                 return info
 131             else:
 132                 for part in parts:
 133                     entries.append(self.url_result('%s/video/%s' % (base_url, lecture_id, part), 'Viidea'))
 134                 lecture_info['_type'] = 'multi_video'
 135         else:
 136             # Probably a playlist
 137             playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id)
 138             entries = [
 139                 self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea')
 140                 for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)]
 141             lecture_info['_type'] = 'playlist'
 142
 143         lecture_info['entries'] = entries
 144         return lecture_info