[yt-dlp.git] / youtube_dl / extractor / orf.py

# coding: utf-8
from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor
from ..utils import (
    HEADRequest,
    unified_strdate,
)


class ORFIE(InfoExtractor):
    """Information extractor for tvthek.orf.at pages.

    The page embeds a JSON argument in an ``initializeAdworx(...)`` call;
    every segment listed there becomes one video entry of the returned
    playlist.
    """

    _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)'

    _TEST = {
        'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747',
        'file': '7319747.mp4',
        'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375',
        'info_dict': {
            'title': 'Was Sie schon immer über Klassik wissen wollten',
            'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4',
            'duration': 3508,
            'upload_date': '20140105',
        },
        'skip': 'Blocked outside of Austria',
    }

    @staticmethod
    def _quality_to_int(quality):
        """Return the first run of ASCII digits in *quality* as an int.

        Returns -1 when the string contains no digits.
        """
        m = re.search(r'([0-9]+)', quality)
        if m is None:
            return -1
        return int(m.group(1))

    def _real_extract(self, url):
        """Download the page at *url* and return a playlist info dict.

        The playlist id is the numeric id captured by ``_VALID_URL``; its
        entries are built from the ``segments`` list of the embedded JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        webpage = self._download_webpage(url, playlist_id)

        data_json = self._search_regex(
            r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
        all_data = json.loads(data_json)
        sdata = all_data[0]['values']['segments']

        entries = []
        for sd in sdata:
            video_id = sd['id']
            formats = [{
                # HLS sources are ranked below every other delivery type
                'preference': -10 if fd['delivery'] == 'hls' else None,
                'format_id': '%s-%s-%s' % (
                    fd['delivery'], fd['quality'], fd['quality_string']),
                'url': fd['src'],
                'protocol': fd['protocol'],
                'quality': self._quality_to_int(fd['quality']),
            } for fd in sd['playlist_item_array']['sources']]

            # Check for geoblocking.
            # There is a property is_geoprotection, but that's always false
            geo_str = sd.get('geoprotection_string')
            if geo_str:
                # Probe the first plain HTTP(S) .mp4 source, if there is one
                http_url = next(
                    (f['url'] for f in formats
                     if re.match(r'^https?://.*\.mp4$', f['url'])),
                    None)
                if http_url is not None:
                    # Only the request itself matters, the response is not
                    # needed. fatal=False keeps a failed probe from aborting
                    # the extraction.
                    self._request_webpage(
                        HEADRequest(http_url), video_id,
                        note='Testing for geoblocking',
                        errnote=((
                            'This video seems to be blocked outside of %s. '
                            'You may want to try the streaming-* formats.')
                            % geo_str),
                        fatal=False)

            self._sort_formats(formats)

            # Date and duration are optional metadata: a segment lacking
            # them must not break extraction of the whole playlist.
            created_date = sd.get('created_date')
            duration = sd.get('duration_in_seconds')
            entries.append({
                '_type': 'video',
                'id': video_id,
                'title': sd['header'],
                'formats': formats,
                'description': sd.get('description'),
                'duration': int(duration) if duration is not None else None,
                'upload_date': (
                    unified_strdate(created_date) if created_date else None),
                'thumbnail': sd.get('image_full_url'),
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': playlist_id,
        }
Commit	Line	Data
89284910	1	# coding: utf-8
5d73273f	2	from __future__ import unicode_literals
89284910	3
54543467	4	import json
5d73273f	5	import re
54543467 JMF	6
	7	from .common import InfoExtractor
	8	from ..utils import (
5d73273f PH	9	HEADRequest,
5d73273f PH	10	unified_strdate,
54543467 JMF	11	)
54543467 JMF	12
5d73273f	13
54543467	14	class ORFIE(InfoExtractor):
5d73273f PH	15	_VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes\|topics/.+?\|program/[^/]+)/(?P<id>\d+)'
	16
	17	_TEST = {
	18	'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747',
	19	'file': '7319747.mp4',
	20	'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375',
	21	'info_dict': {
	22	'title': 'Was Sie schon immer über Klassik wissen wollten',
	23	'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4',
	24	'duration': 3508,
	25	'upload_date': '20140105',
	26	},
	27	'skip': 'Blocked outside of Austria',
	28	}
54543467	29
54543467 JMF	30	def _real_extract(self, url):
	31	mobj = re.match(self._VALID_URL, url)
	32	playlist_id = mobj.group('id')
	33	webpage = self._download_webpage(url, playlist_id)
	34
5d73273f PH	35	data_json = self._search_regex(
	36	r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
	37	all_data = json.loads(data_json)
	38	sdata = all_data[0]['values']['segments']
	39
	40	def quality_to_int(s):
	41	m = re.search('([0-9]+)', s)
	42	if m is None:
	43	return -1
	44	return int(m.group(1))
	45
	46	entries = []
	47	for sd in sdata:
	48	video_id = sd['id']
	49	formats = [{
	50	'preference': -10 if fd['delivery'] == 'hls' else None,
	51	'format_id': '%s-%s-%s' % (
	52	fd['delivery'], fd['quality'], fd['quality_string']),
	53	'url': fd['src'],
	54	'protocol': fd['protocol'],
	55	'quality': quality_to_int(fd['quality']),
	56	} for fd in sd['playlist_item_array']['sources']]
	57
	58	# Check for geoblocking.
	59	# There is a property is_geoprotection, but that's always false
	60	geo_str = sd.get('geoprotection_string')
	61	if geo_str:
	62	try:
	63	http_url = next(
	64	f['url']
	65	for f in formats
	66	if re.match(r'^https?://.*\.mp4$', f['url']))
	67	except StopIteration:
	68	pass
	69	else:
	70	req = HEADRequest(http_url)
	71	response = self._request_webpage(
	72	req, video_id,
	73	note='Testing for geoblocking',
	74	errnote=((
	75	'This video seems to be blocked outside of %s. '
	76	'You may want to try the streaming-* formats.')
	77	% geo_str),
	78	fatal=False)
	79
	80	self._sort_formats(formats)
	81
	82	upload_date = unified_strdate(sd['created_date'])
	83	entries.append({
54543467	84	'_type': 'video',
5d73273f PH	85	'id': video_id,
	86	'title': sd['header'],
	87	'formats': formats,
	88	'description': sd.get('description'),
	89	'duration': int(sd['duration_in_seconds']),
	90	'upload_date': upload_date,
	91	'thumbnail': sd.get('image_full_url'),
	92	})
	93
	94	return {
	95	'_type': 'playlist',
	96	'entries': entries,
	97	'id': playlist_id,
	98	}