[yt-dlp.git] / yt_dlp / extractor / jove.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    unified_strdate
)


class JoveIE(InfoExtractor):
    """Information extractor for video pages on jove.com.

    The media URL is not read from the video page itself: the page is only
    used to find a "chapters id", and the chapters XML fetched with that id
    carries the media URL in the ``video`` attribute of its root element.
    """

    _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
    # Template for the chapters XML endpoint; filled with the id found in the page.
    _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
    _TESTS = [
        {
            'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
            'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b',
            'info_dict': {
                'id': '2744',
                'ext': 'mp4',
                'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation',
                'description': 'md5:015dd4509649c0908bc27f049e0262c6',
                'thumbnail': r're:^https?://.*\.png$',
                'upload_date': '20110523',
            }
        },
        {
            'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation',
            'md5': '914aeb356f416811d911996434811beb',
            'info_dict': {
                'id': '51796',
                'ext': 'mp4',
                'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment',
                'description': 'md5:35ff029261900583970c4023b70f1dc9',
                'thumbnail': r're:^https?://.*\.png$',
                'upload_date': '20140802',
            }
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for a single jove.com video page.

        Downloads the page at ``url``, looks up the chapters id referenced in
        it, downloads the chapters XML and reads the media URL from the
        ``video`` attribute of the XML root. Title, thumbnail, description,
        publication date and comment count are scraped from the page.

        Raises ExtractorError when the chapters XML has no (or an empty)
        ``video`` attribute.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)

        # The id used by the chapters endpoint is taken from the page rather
        # than assumed to be equal to the id in the URL.
        chapters_id = self._html_search_regex(
            r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id')

        chapters_xml = self._download_xml(
            self._CHAPTERS_URL.format(video_id=chapters_id),
            video_id, note='Downloading chapters XML',
            errnote='Failed to download chapters XML')

        video_url = chapters_xml.attrib.get('video')
        if not video_url:
            raise ExtractorError('Failed to get the video URL')

        title = self._html_search_meta('citation_title', webpage, 'title')
        thumbnail = self._og_search_thumbnail(webpage)
        description = self._html_search_regex(
            r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
            webpage, 'description', fatal=False)
        publish_date = unified_strdate(self._html_search_meta(
            'citation_publication_date', webpage, 'publish date', fatal=False))

        # The comment count is optional metadata. The search is non-fatal, so
        # it may come back as None when the meta tag is missing; converting
        # that with int() would raise TypeError and abort the whole
        # extraction, so only convert an actual match.
        comment_count_text = self._html_search_regex(
            r'<meta name="num_comments" content="(\d+) Comments?"',
            webpage, 'comment count', fatal=False)
        comment_count = (
            int(comment_count_text) if comment_count_text is not None else None)

        return {
            'id': video_id,
            'title': title,
            'url': video_url,
            'thumbnail': thumbnail,
            'description': description,
            'upload_date': publish_date,
            'comment_count': comment_count,
        }
Commit	Line	Data
a229909f NJ	1	from __future__ import unicode_literals
	2
	3	import re
a229909f NJ	4
a229909f NJ	5	from .common import InfoExtractor
fe556f1b S	6	from ..utils import (
	7	ExtractorError,
	8	unified_strdate
	9	)
a229909f NJ	10
	11
	12	class JoveIE(InfoExtractor):
	13	_VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
	14	_CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
fe556f1b S	15	_TESTS = [
	16	{
	17	'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
	18	'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b',
	19	'info_dict': {
	20	'id': '2744',
	21	'ext': 'mp4',
	22	'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation',
	23	'description': 'md5:015dd4509649c0908bc27f049e0262c6',
ec85ded8	24	'thumbnail': r're:^https?://.*\.png$',
fe556f1b S	25	'upload_date': '20110523',
	26	}
	27	},
	28	{
	29	'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation',
	30	'md5': '914aeb356f416811d911996434811beb',
	31	'info_dict': {
	32	'id': '51796',
	33	'ext': 'mp4',
	34	'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment',
	35	'description': 'md5:35ff029261900583970c4023b70f1dc9',
ec85ded8	36	'thumbnail': r're:^https?://.*\.png$',
fe556f1b S	37	'upload_date': '20140802',
	38	}
	39	},
	40
	41	]
a229909f NJ	42
	43	def _real_extract(self, url):
	44	mobj = re.match(self._VALID_URL, url)
	45	video_id = mobj.group('id')
	46
	47	webpage = self._download_webpage(url, video_id)
a229909f	48
a229909f NJ	49	chapters_id = self._html_search_regex(
a229909f NJ	50	r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id')
fe556f1b	51
a229909f NJ	52	chapters_xml = self._download_xml(
a229909f NJ	53	self._CHAPTERS_URL.format(video_id=chapters_id),
fe556f1b S	54	video_id, note='Downloading chapters XML',
	55	errnote='Failed to download chapters XML')
	56
a229909f NJ	57	video_url = chapters_xml.attrib.get('video')
	58	if not video_url:
	59	raise ExtractorError('Failed to get the video URL')
	60
fe556f1b S	61	title = self._html_search_meta('citation_title', webpage, 'title')
	62	thumbnail = self._og_search_thumbnail(webpage)
	63	description = self._html_search_regex(
	64	r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
	65	webpage, 'description', fatal=False)
	66	publish_date = unified_strdate(self._html_search_meta(
	67	'citation_publication_date', webpage, 'publish date', fatal=False))
34646967	68	comment_count = int(self._html_search_regex(
fe556f1b	69	r'<meta name="num_comments" content="(\d+) Comments?"',
34646967	70	webpage, 'comment count', fatal=False))
a229909f NJ	71
	72	return {
	73	'id': video_id,
	74	'title': title,
	75	'url': video_url,
a229909f NJ	76	'thumbnail': thumbnail,
	77	'description': description,
	78	'upload_date': publish_date,
fe556f1b	79	'comment_count': comment_count,
a229909f	80	}