[yt-dlp.git] / youtube_dl / extractor / drbonanza.py

from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    parse_iso8601,
)


class DRBonanzaIE(InfoExtractor):
    """Extractor for asset pages under dr.dk/bonanza.

    The page embeds a JSON description of each asset; this extractor
    locates that JSON, then builds the format list, thumbnail and
    metadata from it.
    """

    # The ``id`` group is optional: URLs without ``assetId=`` still match,
    # in which case the first playlist entry on the page is extracted.
    _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/(?:[^/]+/)+(?:[^/])+?(?:assetId=(?P<id>\d+))?(?:[#&]|$)'

    _TESTS = [{
        'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
        'info_dict': {
            'id': '65517',
            'ext': 'mp4',
            'title': 'Talkshowet - Leonard Cohen',
            'description': 'md5:8f34194fb30cd8c8a30ad8b27b70c0ca',
            'thumbnail': r're:^https?://.*\.(?:gif|jpg)$',
            'timestamp': 1295537932,
            'upload_date': '20110120',
            'duration': 3664,
        },
        'params': {
            'skip_download': True,  # requires rtmp
        },
    }, {
        'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
        'md5': '6dfe039417e76795fb783c52da3de11d',
        'info_dict': {
            'id': '59410',
            'ext': 'mp3',
            'title': 'EM fodbold 1992 Danmark - Tyskland finale Transmission',
            'description': 'md5:501e5a195749480552e214fbbed16c4e',
            'thumbnail': r're:^https?://.*\.(?:gif|jpg)$',
            'timestamp': 1223274900,
            'upload_date': '20081006',
            'duration': 7369,
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for the asset referenced by ``url``.

        Downloads the page, pulls the embedded JSON for the requested
        asset (or for the first playlist entry when the URL has no
        ``assetId``) and converts it into formats plus metadata.

        Returns a dict with the keys ``id``, ``display_id``, ``title``,
        ``formats``, ``description``, ``thumbnail``, ``timestamp`` and
        ``duration``. ``thumbnail`` is ``None`` when the asset lists no
        ``Thumb`` file.

        Raises ``KeyError`` if the embedded JSON lacks one of the fields
        that are read unconditionally (``AssetId``, ``Title``,
        ``Created``, ``Files``, ``Description``, ``Actors``,
        ``Colophon``).
        """
        url_id = self._match_id(url)
        webpage = self._download_webpage(url, url_id)

        if url_id:
            # Grab the brace-delimited blob that mentions the asset id.
            info_pattern = r'({.*?%s.*})' % url_id
        else:
            # Just fetch the first video on that page
            info_pattern = r'bonanzaFunctions\.newPlaylist\(({.*})\)'
        info = json.loads(self._html_search_regex(info_pattern, webpage, 'json'))

        asset_id = str(info['AssetId'])
        title = info['Title'].rstrip(' \'\"-,.:;!?')
        # Scaled down by 1000 (presumably milliseconds -> seconds; the
        # expected test durations are consistent with that).
        duration = int_or_none(info.get('Duration'), scale=1000)
        # First published online. "FirstPublished" contains the date for original airing.
        # A trailing fractional-seconds part is stripped before parsing.
        timestamp = parse_iso8601(
            re.sub(r'\.\d+$', '', info['Created']))

        def parse_filename_info(location):
            """Derive format fields from the file name at the end of ``location``.

            Recognizes ``<digits>_<W>x<H>x<bitrate>K.<ext>`` and
            ``<digits>_<bitrate>K.<ext>``; returns an empty dict for any
            other name.
            """
            match = re.search(
                r'/\d+_(?P<width>\d+)x(?P<height>\d+)x(?P<bitrate>\d+)K\.(?P<ext>\w+)$',
                location)
            if match:
                return {
                    'width': int(match.group('width')),
                    'height': int(match.group('height')),
                    'vbr': int(match.group('bitrate')),
                    'ext': match.group('ext'),
                }
            match = re.search(r'/\d+_(?P<bitrate>\d+)K\.(?P<ext>\w+)$', location)
            if match:
                return {
                    'vbr': int(match.group('bitrate')),
                    'ext': match.group('ext'),
                }
            return {}

        video_types = ('VideoHigh', 'VideoMid', 'VideoLow')
        preferencemap = {
            'VideoHigh': -1,
            'VideoMid': -2,
            'VideoLow': -3,
            'Audio': -4,
        }

        # .get() so a missing 'Type' simply yields no formats instead of a
        # KeyError from inside the loop.
        media_type = info.get('Type')

        formats = []
        # Not every asset is guaranteed to list a 'Thumb' file; without this
        # default the return statement would raise UnboundLocalError.
        thumbnail = None
        for media_file in info['Files']:
            file_type = media_file['Type']
            if file_type == 'Thumb':
                thumbnail = media_file['Location']
            elif media_type == 'Video' and file_type in video_types:
                location = media_file['Location']
                fmt = parse_filename_info(location)
                fmt.update({
                    'url': location,
                    'format_id': file_type.replace('Video', ''),
                    'preference': preferencemap.get(file_type, -10),
                })
                if location.startswith('rtmp'):
                    fmt['rtmp_live'] = True  # --resume does not work
                    if '/bonanza/' in location:
                        fmt['play_path'] = location.split('/bonanza/')[1]
                formats.append(fmt)
            elif media_type == 'Audio' and file_type == 'Audio':
                location = media_file['Location']
                fmt = parse_filename_info(location)
                # The format is flagged audio-only below, so the bitrate
                # parsed from the file name is an audio bitrate.
                if 'vbr' in fmt:
                    fmt['abr'] = fmt.pop('vbr')
                fmt.update({
                    'url': location,
                    'format_id': file_type,
                    'vcodec': 'none',
                })
                formats.append(fmt)

        description = '%s\n%s\n%s\n' % (
            info['Description'], info['Actors'], info['Colophon'])

        self._sort_formats(formats)

        # Slug of the lower-cased title (spaces -> dashes, other non-word
        # characters dropped, dash runs collapsed) followed by the asset id.
        display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
        display_id = re.sub(r'-+', '-', display_id)

        return {
            'id': asset_id,
            'display_id': display_id,
            'title': title,
            'formats': formats,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
        }
Commit	Line	Data
8e7a9016 JJ	1	from __future__ import unicode_literals
8e7a9016 JJ	2
8e7a9016 JJ	3	import json
	4	import re
	5
8e2ec955 PH	6	from .common import InfoExtractor
	7	from ..utils import (
	8	int_or_none,
	9	parse_iso8601,
	10	)
	11
	12
8e7a9016 JJ	13	class DRBonanzaIE(InfoExtractor):
	14	_VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/(?:[^/]+/)+(?:[^/])+?(?:assetId=(?P<id>\d+))?(?:[#&]\|$)'
	15
	16	_TESTS = [{
	17	'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
8e7a9016 JJ	18	'info_dict': {
	19	'id': '65517',
	20	'ext': 'mp4',
	21	'title': 'Talkshowet - Leonard Cohen',
	22	'description': 'md5:8f34194fb30cd8c8a30ad8b27b70c0ca',
ec85ded8	23	'thumbnail': r're:^https?://.*\.(?:gif\|jpg)$',
8e7a9016 JJ	24	'timestamp': 1295537932,
8e7a9016 JJ	25	'upload_date': '20110120',
8e2ec955	26	'duration': 3664,
8e7a9016	27	},
18b5e1e5 YCH	28	'params': {
	29	'skip_download': True, # requires rtmp
	30	},
8e2ec955	31	}, {
8e7a9016 JJ	32	'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
	33	'md5': '6dfe039417e76795fb783c52da3de11d',
	34	'info_dict': {
	35	'id': '59410',
	36	'ext': 'mp3',
	37	'title': 'EM fodbold 1992 Danmark - Tyskland finale Transmission',
	38	'description': 'md5:501e5a195749480552e214fbbed16c4e',
ec85ded8	39	'thumbnail': r're:^https?://.*\.(?:gif\|jpg)$',
8e7a9016 JJ	40	'timestamp': 1223274900,
8e7a9016 JJ	41	'upload_date': '20081006',
8e2ec955	42	'duration': 7369,
8e7a9016 JJ	43	},
	44	}]
	45
	46	def _real_extract(self, url):
	47	url_id = self._match_id(url)
8e2ec955 PH	48	webpage = self._download_webpage(url, url_id)
8e2ec955 PH	49
8e7a9016	50	if url_id:
8e2ec955	51	info = json.loads(self._html_search_regex(r'({.?%s.})' % url_id, webpage, 'json'))
8e7a9016 JJ	52	else:
	53	# Just fetch the first video on that page
	54	info = json.loads(self._html_search_regex(r'bonanzaFunctions.newPlaylist\(({.*})\)', webpage, 'json'))
8e2ec955	55
8e7a9016 JJ	56	asset_id = str(info['AssetId'])
8e7a9016 JJ	57	title = info['Title'].rstrip(' \'\"-,.:;!?')
8e2ec955 PH	58	duration = int_or_none(info.get('Duration'), scale=1000)
	59	# First published online. "FirstPublished" contains the date for original airing.
	60	timestamp = parse_iso8601(
	61	re.sub(r'\.\d+$', '', info['Created']))
	62
8e7a9016 JJ	63	def parse_filename_info(url):
	64	match = re.search(r'/\d+_(?P<width>\d+)x(?P<height>\d+)x(?P<bitrate>\d+)K\.(?P<ext>\w+)$', url)
	65	if match:
8e2ec955 PH	66	return {
	67	'width': int(match.group('width')),
	68	'height': int(match.group('height')),
	69	'vbr': int(match.group('bitrate')),
	70	'ext': match.group('ext')
	71	}
8e7a9016 JJ	72	match = re.search(r'/\d+_(?P<bitrate>\d+)K\.(?P<ext>\w+)$', url)
8e7a9016 JJ	73	if match:
8e2ec955 PH	74	return {
	75	'vbr': int(match.group('bitrate')),
	76	'ext': match.group(2)
	77	}
	78	return {}
	79
8e7a9016 JJ	80	video_types = ['VideoHigh', 'VideoMid', 'VideoLow']
	81	preferencemap = {
	82	'VideoHigh': -1,
	83	'VideoMid': -2,
	84	'VideoLow': -3,
	85	'Audio': -4,
	86	}
8e2ec955	87
8e7a9016 JJ	88	formats = []
8e7a9016 JJ	89	for file in info['Files']:
611c1dd9	90	if info['Type'] == 'Video':
8e7a9016	91	if file['Type'] in video_types:
8e2ec955 PH	92	format = parse_filename_info(file['Location'])
8e2ec955 PH	93	format.update({
8e7a9016 JJ	94	'url': file['Location'],
	95	'format_id': file['Type'].replace('Video', ''),
	96	'preference': preferencemap.get(file['Type'], -10),
8e7a9016	97	})
18b5e1e5 YCH	98	if format['url'].startswith('rtmp'):
	99	rtmp_url = format['url']
	100	format['rtmp_live'] = True # --resume does not work
	101	if '/bonanza/' in rtmp_url:
	102	format['play_path'] = rtmp_url.split('/bonanza/')[1]
8e2ec955	103	formats.append(format)
611c1dd9	104	elif file['Type'] == 'Thumb':
8e7a9016	105	thumbnail = file['Location']
611c1dd9 S	106	elif info['Type'] == 'Audio':
611c1dd9 S	107	if file['Type'] == 'Audio':
8e2ec955 PH	108	format = parse_filename_info(file['Location'])
8e2ec955 PH	109	format.update({
8e7a9016 JJ	110	'url': file['Location'],
8e7a9016 JJ	111	'format_id': file['Type'],
8e7a9016 JJ	112	'vcodec': 'none',
8e7a9016 JJ	113	})
8e2ec955	114	formats.append(format)
611c1dd9	115	elif file['Type'] == 'Thumb':
8e7a9016	116	thumbnail = file['Location']
8e2ec955 PH	117
	118	description = '%s\n%s\n%s\n' % (
	119	info['Description'], info['Actors'], info['Colophon'])
8e7a9016	120
8e7a9016	121	self._sort_formats(formats)
8e2ec955	122
8e7a9016 JJ	123	display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
8e7a9016 JJ	124	display_id = re.sub(r'-+', '-', display_id)
8e2ec955	125
8e7a9016 JJ	126	return {
	127	'id': asset_id,
	128	'display_id': display_id,
	129	'title': title,
	130	'formats': formats,
	131	'description': description,
	132	'thumbnail': thumbnail,
	133	'timestamp': timestamp,
	134	'duration': duration,
	135	}