youtube_dl/extractor/democracynow.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5 from .common import InfoExtractor
   6
   7
   8 class DemocracynowIE(InfoExtractor):
   9     _VALID_URL = r'https?://(?:www\.)?democracynow.org/?(?P<id>[^\?]*)'
  10     IE_NAME = 'democracynow'
  11     _TESTS = [{
  12         'url': 'http://www.democracynow.org/shows/2015/7/3',
  13         'info_dict': {
  14             'id': '2015-0703-001',
  15             'ext': 'mp4',
  16             'title': 'July 03, 2015 - Democracy Now!',
  17             'description': 'A daily independent global news hour with Amy Goodman & Juan Gonz\xe1lez "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs',
  18             'uploader': 'Democracy Now',
  19             'upload_date': None,
  20         },
  21     }, {
  22         'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
  23         'info_dict': {
  24             'id': '2015-0703-001',
  25             'ext': 'mp4',
  26             'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag',
  27             'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21',
  28             'uploader': 'Democracy Now',
  29             'upload_date': None,
  30         },
  31     }]
  32
  33     def _real_extract(self, url):
  34         display_id = self._match_id(url)
  35         base_host = re.search(r'^(.+?://[^/]+)', url).group(1)
  36         if display_id == '':
  37             display_id = 'home'
  38         webpage = self._download_webpage(url, display_id)
  39         description = self._og_search_description(webpage)
  40
  41         jstr = self._search_regex(r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json')
  42         js = self._parse_json(jstr, display_id)
  43         video_id = None
  44         formats = []
  45         subtitles = {}
  46         for key in ('caption_file', '.......'):
  47             # ....... = pending vtt support that doesn't clobber srt 'chapter_file':
  48             url = js.get(key, '')
  49             if url == '' or url is None:
  50                 continue
  51             if not re.match(r'^https?://', url):
  52                 url = base_host + url
  53             ext = re.search(r'\.([^\.]+)$', url).group(1)
  54             subtitles['eng'] = [{
  55                 'ext': ext,
  56                 'url': url,
  57             }]
  58         for key in ('file', 'audio', 'video'):
  59             url = js.get(key, '')
  60             if url == '' or url is None:
  61                 continue
  62             if not re.match(r'^https?://', url):
  63                 url = base_host + url
  64             purl = re.search(r'/(?P<dir>[^/]+)/(?:dn)?(?P<fn>[^/]+?)\.(?P<ext>[^\.\?]+)(?P<hasparams>\?|$)', url)
  65             if video_id is None:
  66                 video_id = purl.group('fn')
  67             if js.get('start') is not None:
  68                 url += '&' if purl.group('hasparams') == '?' else '?'
  69                 url = url + 'start=' + str(js.get('start'))
  70             formats.append({
  71                 'format_id': purl.group('dir'),
  72                 'ext': purl.group('ext'),
  73                 'url': url,
  74             })
  75         self._sort_formats(formats)
  76         ret = {
  77             'id': video_id,
  78             'title': js.get('title'),
  79             'description': description,
  80             'uploader': 'Democracy Now',
  81             'subtitles': subtitles,
  82             'formats': formats,
  83         }
  84         return ret