[yt-dlp.git] / youtube_dl / extractor / wat.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    ExtractorError,
    unified_strdate,
    HEADRequest,
    int_or_none,
)


class WatIE(InfoExtractor):
    """Information extractor for videos hosted on wat.tv.

    Handles both public ``wat.tv/video/...`` page URLs and the internal
    ``wat:<id>`` form, which this extractor itself emits for every part of a
    multipart video.
    """

    _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)'
    IE_NAME = 'wat.tv'
    _TESTS = [
        {
            'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',
            'md5': '83d882d9de5c9d97f0bb2c6273cde56a',
            'info_dict': {
                'id': '11713067',
                'ext': 'mp4',
                'title': 'Soupe de figues à l\'orange et aux épices',
                'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.',
                'upload_date': '20140819',
                'duration': 120,
            },
        },
        {
            'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html',
            'md5': '34bdfa5ca9fd3c7eb88601b635b0424c',
            'info_dict': {
                'id': '11713075',
                'ext': 'mp4',
                'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',
                'upload_date': '20140816',
            },
            'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."],
        },
    ]

    # Fallback renditions as (video bitrate, width, height). They are probed
    # one by one against the static download host when the regular URL
    # lookup fails (see the ``except`` branch in _real_extract).
    _FORMATS = (
        (200, 416, 234),
        (400, 480, 270),
        (600, 640, 360),
        (1200, 640, 360),
        (1800, 960, 540),
        (2500, 1280, 720),
    )

    def _real_extract(self, url):
        """Extract the metadata and formats of a wat.tv video.

        Returns a playlist result when the requested id turns out to be a
        multipart video (one ``wat:<id>`` entry per chapter); otherwise
        returns a single-video info dict with the collected formats.
        """
        video_id = self._match_id(url)
        # The matched id is either the numeric id itself (all digits, more
        # than 6 characters) or its base-36 encoding, which is decoded here.
        video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36))

        # 'contentv4' is used in the website, but it also returns the related
        # videos, we don't need them
        video_data = self._download_json(
            'http://www.wat.tv/interface/contentv4s/' + video_id, video_id)
        video_info = video_data['media']

        # An error description is only reported as a warning; extraction
        # is still attempted afterwards.
        error_desc = video_info.get('error_desc')
        if error_desc:
            self.report_warning(
                '%s returned error: %s' % (self.IE_NAME, error_desc))

        # A missing 'chapters' key is treated like an empty chapter list
        # instead of aborting with a KeyError.
        chapters = video_info.get('chapters')
        if chapters:
            first_chapter = chapters[0]

            def video_id_for_chapter(chapter):
                """Return the id prefix of the chapter's 'tc_start' value."""
                return chapter['tc_start'].split('-')[0]

            if video_id_for_chapter(first_chapter) != video_id:
                self.to_screen('Multipart video detected')
                entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters]
                # The playlist title is optional, so do not fail without it.
                return self.playlist_result(entries, video_id, video_info.get('title'))
            # Otherwise we can continue and extract just one part, we have to use
            # the video id for getting the video url
        else:
            first_chapter = video_info

        title = first_chapter['title']

        def extract_url(path_template, url_type):
            """Resolve a ``/get/`` path to the URL it redirects to.

            Issues a HEAD request and returns the final URL. If the request
            was not redirected at all, an expected ExtractorError is raised.
            """
            req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id)
            head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type)
            red_url = head.geturl()
            if req_url == red_url:
                raise ExtractorError(
                    '%s said: Sorry, this video is not available from your country.' % self.IE_NAME,
                    expected=True)
            return red_url

        # The same list is shared by the regular path and the fallback below,
        # so formats gathered before an ExtractorError are kept.
        formats = []
        try:
            http_url = extract_url('android5/%s.mp4', 'http')
            m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8')
            m3u8_formats = self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
            formats.extend(m3u8_formats)
            # The HDS manifest URL is derived from the HLS one; its failure
            # is tolerated (fatal=False).
            formats.extend(self._extract_f4m_formats(
                m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'),
                video_id, f4m_id='hds', fatal=False))
            # Derive a progressive HTTP variant for every HLS format that
            # carries both bitrates, by rewriting the bitrate part of the
            # resolved mp4 URL; only URLs that validate are kept.
            for m3u8_format in m3u8_formats:
                vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr')
                if not vbr or not abr:
                    continue
                format_id = m3u8_format['format_id'].replace('hls', 'http')
                fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url)
                if self._is_valid_url(fmt_url, video_id, format_id):
                    f = m3u8_format.copy()
                    f.update({
                        'url': fmt_url,
                        'format_id': format_id,
                        'protocol': 'http',
                    })
                    formats.append(f)
            self._sort_formats(formats)
        except ExtractorError:
            # Fallback: probe the fixed rendition table on the static host.
            # The audio bitrate matches the '64k' suffix of the URL template.
            abr = 64
            for vbr, width, height in self._FORMATS:
                tbr = vbr + abr
                format_id = 'http-%s' % tbr
                fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr)
                if self._is_valid_url(fmt_url, video_id, format_id):
                    formats.append({
                        'format_id': format_id,
                        'url': fmt_url,
                        'vbr': vbr,
                        'abr': abr,
                        'width': width,
                        'height': height,
                    })

        # 'configv4' may be absent or null; both cases fall back to an empty
        # dict so that the optional date lookup never raises.
        date_diffusion = first_chapter.get('date_diffusion') or (video_data.get('configv4') or {}).get('estatS4')
        upload_date = unified_strdate(date_diffusion) if date_diffusion else None
        # The duration is optional metadata taken from the first file entry;
        # a missing or empty 'files' list simply leaves it unset.
        duration = None
        files = video_info.get('files')
        if files:
            duration = int_or_none(files[0].get('duration'))

        return {
            'id': video_id,
            'title': title,
            'thumbnail': first_chapter.get('preview'),
            'description': first_chapter.get('description'),
            'view_count': int_or_none(video_info.get('views')),
            'upload_date': upload_date,
            'duration': duration,
            'formats': formats,
        }
Commit	Line	Data
8244288d	1	# coding: utf-8
e7916255	2	from __future__ import unicode_literals
8244288d	3
99afb3dd JMF	4	import re
	5
	6	from .common import InfoExtractor
c5f51551	7	from ..compat import compat_str
86916dae S	8	from ..utils import (
	9	ExtractorError,
	10	unified_strdate,
c5f51551	11	HEADRequest,
57ce8a6d	12	int_or_none,
86916dae	13	)
99afb3dd JMF	14
	15
	16	class WatIE(InfoExtractor):
c5f51551	17	_VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)'
99afb3dd	18	IE_NAME = 'wat.tv'
c28df247 S	19	_TESTS = [
	20	{
	21	'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',
c5f51551	22	'md5': '83d882d9de5c9d97f0bb2c6273cde56a',
c28df247 S	23	'info_dict': {
c28df247 S	24	'id': '11713067',
c28df247 S	25	'ext': 'mp4',
	26	'title': 'Soupe de figues à l\'orange et aux épices',
	27	'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.',
	28	'upload_date': '20140819',
	29	'duration': 120,
	30	},
	31	},
	32	{
	33	'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html',
57ce8a6d	34	'md5': '34bdfa5ca9fd3c7eb88601b635b0424c',
c28df247 S	35	'info_dict': {
c28df247 S	36	'id': '11713075',
c28df247 S	37	'ext': 'mp4',
c28df247 S	38	'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',
c28df247	39	'upload_date': '20140816',
c28df247	40	},
57ce8a6d	41	'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."],
fa800269	42	},
c28df247	43	]
e7916255	44
57ce8a6d RA	45	_FORMATS = (
	46	(200, 416, 234),
	47	(400, 480, 270),
	48	(600, 640, 360),
	49	(1200, 640, 360),
	50	(1800, 960, 540),
	51	(2500, 1280, 720),
	52	)
	53
99afb3dd	54	def _real_extract(self, url):
c5f51551	55	video_id = self._match_id(url)
c5f51551	56	video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36))
8244288d	57
c5f51551	58	# 'contentv4' is used in the website, but it also returns the related
c5f51551	59	# videos, we don't need them
57ce8a6d RA	60	video_data = self._download_json(
	61	'http://www.wat.tv/interface/contentv4s/' + video_id, video_id)
	62	video_info = video_data['media']
a54bda3a	63
86916dae S	64	error_desc = video_info.get('error_desc')
86916dae S	65	if error_desc:
57ce8a6d RA	66	self.report_warning(
57ce8a6d RA	67	'%s returned error: %s' % (self.IE_NAME, error_desc))
86916dae	68
8244288d	69	chapters = video_info['chapters']
57ce8a6d RA	70	if chapters:
57ce8a6d RA	71	first_chapter = chapters[0]
99afb3dd	72
57ce8a6d RA	73	def video_id_for_chapter(chapter):
57ce8a6d RA	74	return chapter['tc_start'].split('-')[0]
8244288d	75
57ce8a6d RA	76	if video_id_for_chapter(first_chapter) != video_id:
	77	self.to_screen('Multipart video detected')
	78	entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters]
	79	return self.playlist_result(entries, video_id, video_info['title'])
	80	# Otherwise we can continue and extract just one part, we have to use
	81	# the video id for getting the video url
	82	else:
	83	first_chapter = video_info
c5f51551	84
57ce8a6d	85	title = first_chapter['title']
c5f51551	86
	87	def extract_url(path_template, url_type):
	88	req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id)
	89	head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type)
	90	red_url = head.geturl()
	91	if req_url == red_url:
	92	raise ExtractorError(
	93	'%s said: Sorry, this video is not available from your country.' % self.IE_NAME,
	94	expected=True)
	95	return red_url
	96
c5f51551	97	formats = []
57ce8a6d RA	98	try:
	99	http_url = extract_url('android5/%s.mp4', 'http')
	100	m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8')
	101	m3u8_formats = self._extract_m3u8_formats(
	102	m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
	103	formats.extend(m3u8_formats)
	104	formats.extend(self._extract_f4m_formats(
	105	m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'),
	106	video_id, f4m_id='hds', fatal=False))
	107	for m3u8_format in m3u8_formats:
	108	vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr')
	109	if not vbr or not abr:
	110	continue
	111	format_id = m3u8_format['format_id'].replace('hls', 'http')
	112	fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url)
	113	if self._is_valid_url(fmt_url, video_id, format_id):
	114	f = m3u8_format.copy()
	115	f.update({
	116	'url': fmt_url,
	117	'format_id': format_id,
	118	'protocol': 'http',
	119	})
	120	formats.append(f)
	121	self._sort_formats(formats)
	122	except ExtractorError:
	123	abr = 64
	124	for vbr, width, height in self._FORMATS:
	125	tbr = vbr + abr
	126	format_id = 'http-%s' % tbr
	127	fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr)
	128	if self._is_valid_url(fmt_url, video_id, format_id):
	129	formats.append({
	130	'format_id': format_id,
	131	'url': fmt_url,
	132	'vbr': vbr,
	133	'abr': abr,
	134	'width': width,
	135	'height': height,
	136	})
	137
	138	date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4')
	139	upload_date = unified_strdate(date_diffusion) if date_diffusion else None
	140	duration = None
	141	files = video_info['files']
	142	if files:
	143	duration = int_or_none(files[0].get('duration'))
a54bda3a	144
e7916255	145	return {
c5f51551	146	'id': video_id,
57ce8a6d RA	147	'title': title,
	148	'thumbnail': first_chapter.get('preview'),
	149	'description': first_chapter.get('description'),
	150	'view_count': int_or_none(video_info.get('views')),
e7916255	151	'upload_date': upload_date,
57ce8a6d	152	'duration': duration,
a54bda3a	153	'formats': formats,
e7916255	154	}