[yt-dlp.git] / youtube_dl / extractor / democracynow.py

# coding: utf-8
from __future__ import unicode_literals

import re
import os.path

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    url_basename,
    remove_start,
)


class DemocracynowIE(InfoExtractor):
    """Extractor for democracynow.org pages.

    The page markup carries a ``<script type="text/json">`` element whose
    JSON payload provides the title, thumbnail, media URLs and caption URLs
    used to build the result.
    """
    _VALID_URL = r'https?://(?:www\.)?democracynow\.org/(?P<id>[^\?]*)'
    IE_NAME = 'democracynow'
    _TESTS = [{
        'url': 'http://www.democracynow.org/shows/2015/7/3',
        'md5': '3757c182d3d84da68f5c8f506c18c196',
        'info_dict': {
            'id': '2015-0703-001',
            'ext': 'mp4',
            'title': 'Daily Show',
        },
    }, {
        'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
        'info_dict': {
            'id': '2015-0703-001',
            'ext': 'mp4',
            'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag',
            'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the page at ``url``.

        The result contains ``id``, ``title``, ``description``,
        ``thumbnail``, ``subtitles`` and ``formats``.  ``id`` is derived from
        the first media file name and falls back to the path matched by
        ``_VALID_URL`` when the JSON lists no media URL.

        Raises ``KeyError`` when the embedded JSON has no ``title``; any
        error raised by the download, regex search, JSON parsing or format
        sorting helpers propagates unchanged.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        json_data = self._parse_json(self._search_regex(
            r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'),
            display_id)

        # The title is required, so a missing key is allowed to raise.
        title = json_data['title']

        formats = []
        video_id = None

        for key in ('file', 'audio', 'video', 'high_res_video'):
            media_url = json_data.get(key)
            if not media_url:
                continue
            # Resolve against the page URL, then drop the query string.
            media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url))
            # The id comes from the first media URL found: its base name
            # without the extension and without a leading 'dn'.
            video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn')
            formats.append({
                'url': media_url,
                'vcodec': 'none' if key == 'audio' else None,
            })

        self._sort_formats(formats)

        return {
            'id': video_id or display_id,
            'title': title,
            'description': self._og_search_description(webpage, default=None),
            'thumbnail': json_data.get('image'),
            'subtitles': self._parse_caption_tracks(json_data, url),
            'formats': formats,
        }

    @staticmethod
    def _parse_caption_tracks(json_data, url):
        """Collect caption URLs from ``json_data`` as a subtitles dict.

        Returns a dict mapping a language code to a list of ``{'url': ...}``
        entries, with every URL resolved against ``url``.  Captions are
        optional, so null, empty or malformed entries are skipped rather
        than raising.
        """
        default_lang = 'en'
        subtitles = {}

        def add_track(lang, track_url):
            # An empty URL would resolve to the page itself, so ignore it.
            if not track_url:
                return
            subtitles.setdefault(lang, []).append({
                'url': compat_urlparse.urljoin(url, track_url),
            })

        # chapter_file are not subtitles
        add_track(default_lang, json_data.get('caption_file'))

        # 'captions' may be absent or null; treat both as "no captions".
        for caption in json_data.get('captions') or []:
            if not isinstance(caption, dict):
                continue
            lang = (caption.get('language') or '').lower() or default_lang
            add_track(lang, caption.get('url'))

        return subtitles
Commit	Line	Data
f8705443	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
f8705443	4	import re
dde9fe97 YCH	5	import os.path
dde9fe97 YCH	6
f8705443	7	from .common import InfoExtractor
dde9fe97 YCH	8	from ..compat import compat_urlparse
	9	from ..utils import (
	10	url_basename,
	11	remove_start,
	12	)
f8705443	13
	14
	15	class DemocracynowIE(InfoExtractor):
92519402	16	_VALID_URL = r'https?://(?:www\.)?democracynow\.org/(?P<id>[^\?]*)'
f8705443	17	IE_NAME = 'democracynow'
	18	_TESTS = [{
	19	'url': 'http://www.democracynow.org/shows/2015/7/3',
a134426d	20	'md5': '3757c182d3d84da68f5c8f506c18c196',
f8705443	21	'info_dict': {
	22	'id': '2015-0703-001',
	23	'ext': 'mp4',
a134426d	24	'title': 'Daily Show',
f8705443	25	},
eb080813	26	}, {
f8705443	27	'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
	28	'info_dict': {
	29	'id': '2015-0703-001',
	30	'ext': 'mp4',
	31	'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag',
	32	'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21',
f8705443	33	},
a134426d S	34	'params': {
	35	'skip_download': True,
	36	},
f8705443	37	}]
	38
	39	def _real_extract(self, url):
	40	display_id = self._match_id(url)
18da2463	41
f8705443	42	webpage = self._download_webpage(url, display_id)
f8705443	43
fd810282	44	json_data = self._parse_json(self._search_regex(
dde9fe97 YCH	45	r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'),
dde9fe97 YCH	46	display_id)
18da2463 S	47
18da2463 S	48	title = json_data['title']
f8705443	49	formats = []
dde9fe97	50
18da2463 S	51	video_id = None
	52
	53	for key in ('file', 'audio', 'video', 'high_res_video'):
	54	media_url = json_data.get(key, '')
	55	if not media_url:
	56	continue
	57	media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url))
	58	video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn')
	59	formats.append({
	60	'url': media_url,
	61	'vcodec': 'none' if key == 'audio' else None,
	62	})
dde9fe97	63
18da2463 S	64	self._sort_formats(formats)
	65
	66	default_lang = 'en'
f8705443	67	subtitles = {}
dde9fe97 YCH	68
	69	def add_subtitle_item(lang, info_dict):
	70	if lang not in subtitles:
	71	subtitles[lang] = []
	72	subtitles[lang].append(info_dict)
	73
	74	# chapter_file are not subtitles
fd810282	75	if 'caption_file' in json_data:
dde9fe97	76	add_subtitle_item(default_lang, {
fd810282	77	'url': compat_urlparse.urljoin(url, json_data['caption_file']),
dde9fe97 YCH	78	})
dde9fe97 YCH	79
fd810282	80	for subtitle_item in json_data.get('captions', []):
dde9fe97 YCH	81	lang = subtitle_item.get('language', '').lower() or default_lang
	82	add_subtitle_item(lang, {
	83	'url': compat_urlparse.urljoin(url, subtitle_item['url']),
	84	})
	85
18da2463	86	description = self._og_search_description(webpage, default=None)
dde9fe97 YCH	87
dde9fe97 YCH	88	return {
0aeb9a10	89	'id': video_id or display_id,
18da2463	90	'title': title,
f8705443	91	'description': description,
18da2463	92	'thumbnail': json_data.get('image'),
f8705443	93	'subtitles': subtitles,
	94	'formats': formats,
	95	}