[yt-dlp.git] / yt_dlp / extractor / canalc2.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import parse_duration


class Canalc2IE(InfoExtractor):
    """Extractor for videos published on canalc2.tv.

    Accepts both ``canalc2.tv/video/<id>`` URLs and
    ``archives-canalc2.u-strasbg.fr/video.asp?...idVideo=<id>`` URLs (see
    ``_VALID_URL``).  Whatever form the input URL takes, the page that is
    actually downloaded and scraped is ``http://www.canalc2.tv/video/<id>``.
    """

    IE_NAME = 'canalc2.tv'
    _VALID_URL = r'https?://(?:(?:www\.)?canalc2\.tv/video/|archives-canalc2\.u-strasbg\.fr/video\.asp\?.*\bidVideo=)(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://www.canalc2.tv/video/12163',
        'md5': '060158428b650f896c542dfbb3d6487f',
        'info_dict': {
            'id': '12163',
            'ext': 'mp4',
            'title': 'Terrasses du Numérique',
            'duration': 122,
        },
    }, {
        'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Scrape the canalc2.tv video page and return its info dict.

        Formats are collected from every quoted ``file = "..."`` assignment
        found in the page.  If there are none, the first HTML5 media entry
        parsed from the page is used instead.

        The returned dict carries ``formats`` plus ``id``, ``title`` and
        ``duration`` (the latter is looked up with ``fatal=False``, so it is
        optional; the title lookup is not).
        """
        video_id = self._match_id(url)

        # Always fetch the canonical canalc2.tv page, even when the input was
        # an archives-canalc2.u-strasbg.fr URL.
        webpage = self._download_webpage(
            'http://www.canalc2.tv/video/%s' % video_id, video_id)

        title = self._html_search_regex(
            r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.+?)</h3>',
            webpage, 'title')

        formats = []
        # The first group only captures the quote character so that the
        # backreference can match the closing quote; it is discarded.
        for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage):
            if not video_url.startswith('rtmp://'):
                formats.append({
                    'url': video_url,
                    'format_id': 'http',
                })
                continue

            # Start from the raw RTMP URL and refine it only when it can be
            # split into base URL / app / play path.  Previously an RTMP URL
            # without an "mp4:" play path made re.search() return None and
            # the following .group() calls raised AttributeError.
            rtmp_format = {
                'url': video_url,
                'format_id': 'rtmp',
                'ext': 'flv',
                'page_url': url,
            }
            rtmp = re.search(
                r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
            if rtmp:
                rtmp_format.update({
                    'url': rtmp.group('url'),
                    'app': rtmp.group('app'),
                    'play_path': rtmp.group('play_path'),
                })
            formats.append(rtmp_format)

        if formats:
            info = {
                'formats': formats,
            }
        else:
            # No "file = ..." assignments in the page: fall back to the first
            # HTML5 media entry found in the markup.
            info = self._parse_html5_media_entries(url, webpage, url)[0]

        self._sort_formats(info['formats'])

        info.update({
            'id': video_id,
            'title': title,
            'duration': parse_duration(self._search_regex(
                r'id=["\']video_duree["\'][^>]*>([^<]+)',
                webpage, 'duration', fatal=False)),
        })
        return info
Commit	Line	Data
cd0abcc0	1	# coding: utf-8
0568c352 PH	2	from __future__ import unicode_literals
0568c352 PH	3
cd0abcc0	4	import re
cd0abcc0 PR	5
cd0abcc0 PR	6	from .common import InfoExtractor
b1bf0635	7	from ..utils import parse_duration
cd0abcc0	8
e86ea47c	9
cd0abcc0	10	class Canalc2IE(InfoExtractor):
6b361ad5	11	IE_NAME = 'canalc2.tv'
7a34302e	12	_VALID_URL = r'https?://(?:(?:www\.)?canalc2\.tv/video/|archives-canalc2\.u-strasbg\.fr/video\.asp\?.*\bidVideo=)(?P<id>\d+)'
cd0abcc0	13
7a34302e	14	_TESTS = [{
b0f001a6	15	'url': 'http://www.canalc2.tv/video/12163',
0568c352 PH	16	'md5': '060158428b650f896c542dfbb3d6487f',
	17	'info_dict': {
	18	'id': '12163',
dde97ea8	19	'ext': 'mp4',
608945d4 S	20	'title': 'Terrasses du Numérique',
608945d4 S	21	'duration': 122,
b0f001a6	22	},
7a34302e S	23	}, {
	24	'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
	25	'only_matching': True,
	26	}]
cd0abcc0 PR	27
cd0abcc0 PR	28	def _real_extract(self, url):
b0f001a6	29	video_id = self._match_id(url)
7a34302e S	30
	31	webpage = self._download_webpage(
	32	'http://www.canalc2.tv/video/%s' % video_id, video_id)
	33
38f59e27 S	34	title = self._html_search_regex(
	35	r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.+?)</h3>',
	36	webpage, 'title')
	37
7a34302e S	38	formats = []
	39	for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage):
	40	if video_url.startswith('rtmp://'):
	41	rtmp = re.search(
	42	r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
	43	formats.append({
	44	'url': rtmp.group('url'),
	45	'format_id': 'rtmp',
	46	'ext': 'flv',
	47	'app': rtmp.group('app'),
	48	'play_path': rtmp.group('play_path'),
	49	'page_url': url,
	50	})
	51	else:
	52	formats.append({
	53	'url': video_url,
	54	'format_id': 'http',
	55	})
ff242459	56
38f59e27 S	57	if formats:
	58	info = {
	59	'formats': formats,
	60	}
	61	else:
	62	info = self._parse_html5_media_entries(url, webpage, url)[0]
	63
	64	self._sort_formats(info['formats'])
0568c352	65
38f59e27	66	info.update({
0568c352	67	'id': video_id,
0568c352	68	'title': title,
38f59e27 S	69	'duration': parse_duration(self._search_regex(
	70	r'id=["\']video_duree["\'][^>]*>([^<]+)',
	71	webpage, 'duration', fatal=False)),
	72	})
	73	return info