[yt-dlp.git] / yt_dlp / extractor / jove.py

from __future__ import unicode_literals


from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    int_or_none,
    unified_strdate,
)


class JoveIE(InfoExtractor):
    """Information extractor for video pages on jove.com."""

    _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
    # Template for the chapters XML document; _real_extract reads the media
    # URL from the ``video`` attribute of that document's root element.
    _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
    _TESTS = [
        {
            'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
            'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b',
            'info_dict': {
                'id': '2744',
                'ext': 'mp4',
                'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation',
                'description': 'md5:015dd4509649c0908bc27f049e0262c6',
                'thumbnail': r're:^https?://.*\.png$',
                'upload_date': '20110523',
            }
        },
        {
            'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation',
            'md5': '914aeb356f416811d911996434811beb',
            'info_dict': {
                'id': '51796',
                'ext': 'mp4',
                'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment',
                'description': 'md5:35ff029261900583970c4023b70f1dc9',
                'thumbnail': r're:^https?://.*\.png$',
                'upload_date': '20140802',
            }
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for a single jove.com video page.

        The page is downloaded and searched for a chapters id, which is used
        to fetch the chapters XML. The media URL is taken from the ``video``
        attribute of that XML's root element; the remaining fields come from
        meta tags and markup in the page itself.

        Raises ExtractorError when the chapters XML has no ``video``
        attribute. Description, publish date and comment count are looked up
        non-fatally and are None when the page does not provide them.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # The id in the chapters link is looked up separately from the URL id
        # and is the one sent to the chapters endpoint.
        chapters_id = self._html_search_regex(
            r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id')

        chapters_xml = self._download_xml(
            self._CHAPTERS_URL.format(video_id=chapters_id),
            video_id, note='Downloading chapters XML',
            errnote='Failed to download chapters XML')

        video_url = chapters_xml.attrib.get('video')
        if not video_url:
            raise ExtractorError('Failed to get the video URL')

        title = self._html_search_meta('citation_title', webpage, 'title')
        thumbnail = self._og_search_thumbnail(webpage)
        description = self._html_search_regex(
            r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
            webpage, 'description', fatal=False)
        publish_date = unified_strdate(self._html_search_meta(
            'citation_publication_date', webpage, 'publish date', fatal=False))
        # The lookup is non-fatal, so it may yield None; a bare int() would
        # then raise TypeError and abort the extraction over an optional
        # field. int_or_none maps a missing value to None instead.
        comment_count = int_or_none(self._html_search_regex(
            r'<meta name="num_comments" content="(\d+) Comments?"',
            webpage, 'comment count', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'url': video_url,
            'thumbnail': thumbnail,
            'description': description,
            'upload_date': publish_date,
            'comment_count': comment_count,
        }
Commit	Line	Data
a229909f NJ	1	from __future__ import unicode_literals
a229909f NJ	2
a229909f NJ	3
a229909f NJ	4	from .common import InfoExtractor
fe556f1b S	5	from ..utils import (
	6	ExtractorError,
	7	unified_strdate
	8	)
a229909f NJ	9
	10
	11	class JoveIE(InfoExtractor):
	12	_VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
	13	_CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
fe556f1b S	14	_TESTS = [
	15	{
	16	'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
	17	'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b',
	18	'info_dict': {
	19	'id': '2744',
	20	'ext': 'mp4',
	21	'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation',
	22	'description': 'md5:015dd4509649c0908bc27f049e0262c6',
ec85ded8	23	'thumbnail': r're:^https?://.*\.png$',
fe556f1b S	24	'upload_date': '20110523',
	25	}
	26	},
	27	{
	28	'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation',
	29	'md5': '914aeb356f416811d911996434811beb',
	30	'info_dict': {
	31	'id': '51796',
	32	'ext': 'mp4',
	33	'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment',
	34	'description': 'md5:35ff029261900583970c4023b70f1dc9',
ec85ded8	35	'thumbnail': r're:^https?://.*\.png$',
fe556f1b S	36	'upload_date': '20140802',
	37	}
	38	},
	39
	40	]
a229909f NJ	41
a229909f NJ	42	def _real_extract(self, url):
5ad28e7f	43	mobj = self._match_valid_url(url)
a229909f NJ	44	video_id = mobj.group('id')
	45
	46	webpage = self._download_webpage(url, video_id)
a229909f	47
a229909f NJ	48	chapters_id = self._html_search_regex(
a229909f NJ	49	r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id')
fe556f1b	50
a229909f NJ	51	chapters_xml = self._download_xml(
a229909f NJ	52	self._CHAPTERS_URL.format(video_id=chapters_id),
fe556f1b S	53	video_id, note='Downloading chapters XML',
	54	errnote='Failed to download chapters XML')
	55
a229909f NJ	56	video_url = chapters_xml.attrib.get('video')
	57	if not video_url:
	58	raise ExtractorError('Failed to get the video URL')
	59
fe556f1b S	60	title = self._html_search_meta('citation_title', webpage, 'title')
	61	thumbnail = self._og_search_thumbnail(webpage)
	62	description = self._html_search_regex(
	63	r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
	64	webpage, 'description', fatal=False)
	65	publish_date = unified_strdate(self._html_search_meta(
	66	'citation_publication_date', webpage, 'publish date', fatal=False))
34646967	67	comment_count = int(self._html_search_regex(
fe556f1b	68	r'<meta name="num_comments" content="(\d+) Comments?"',
34646967	69	webpage, 'comment count', fatal=False))
a229909f NJ	70
	71	return {
	72	'id': video_id,
	73	'title': title,
	74	'url': video_url,
a229909f NJ	75	'thumbnail': thumbnail,
	76	'description': description,
	77	'upload_date': publish_date,
fe556f1b	78	'comment_count': comment_count,
a229909f	79	}