[yt-dlp.git] / youtube_dl / extractor / expressen.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    int_or_none,
    unescapeHTML,
    unified_timestamp,
)


class ExpressenIE(InfoExtractor):
    """Information extractor for video pages under expressen.se/tv/."""

    # Any number of intermediate path segments may follow /tv/; the named
    # group ``id`` captures the final segment and is used as the display id.
    _VALID_URL = r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [
        {
            'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/',
            'md5': '2fbbe3ca14392a6b1b36941858d33a45',
            'info_dict': {
                'id': '8690962',
                'ext': 'mp4',
                'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden',
                'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 788,
                'timestamp': 1526639109,
                'upload_date': '20180518',
            },
        },
        {
            'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for a single Expressen TV page.

        The page is downloaded once and two JSON payloads are read from its
        ``data-video-tracking-info`` and ``data-article-data`` attributes.
        The first supplies the video id and most metadata; the second
        supplies the stream URL and fallback values for title, thumbnail
        and duration.

        Returns a dict with the keys ``id``, ``display_id``, ``title``,
        ``description``, ``thumbnail``, ``duration``, ``timestamp`` and
        ``formats``.

        The ``videoId`` and ``stream`` keys are accessed by subscript, and
        so is ``title`` when ``titleRaw`` is missing or empty, so their
        absence raises instead of being silently ignored. Failure behaviour
        of the download/search/parse helpers is defined by InfoExtractor
        and is not visible in this file.
        """
        slug = self._match_id(url)
        page = self._download_webpage(url, slug)

        def load_json_attribute(attr_name):
            # Capture the value of a quoted data-<attr_name> attribute; the
            # back-reference makes the capture stop at the same quote
            # character that opened it.
            raw_value = self._search_regex(
                r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % attr_name,
                page, 'info', group='value')
            # unescapeHTML is applied to the raw attribute text before it
            # is parsed as JSON.
            return self._parse_json(
                raw_value, slug, transform_source=unescapeHTML)

        tracking = load_json_attribute('video-tracking-info')
        video_id = tracking['videoId']

        article = load_json_attribute('article-data')
        stream_url = article['stream']

        if determine_ext(stream_url) != 'm3u8':
            # Anything that is not an m3u8 URL is offered as one format.
            formats = [{'url': stream_url}]
        else:
            formats = self._extract_m3u8_formats(
                stream_url, slug, 'mp4',
                entry_protocol='m3u8_native', m3u8_id='hls')
        self._sort_formats(formats)

        # Tracking info is preferred; article data is the fallback for the
        # title, thumbnail and duration fields.
        return {
            'id': video_id,
            'display_id': slug,
            'title': tracking.get('titleRaw') or article['title'],
            'description': tracking.get('descriptionRaw'),
            'thumbnail': tracking.get('socialMediaImage') or article.get('image'),
            'duration': int_or_none(
                tracking.get('videoTotalSecondsDuration')
                or article.get('totalSecondsDuration')),
            'timestamp': unified_timestamp(tracking.get('publishDate')),
            'formats': formats,
        }
Commit	Line	Data
734d461c S	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	from .common import InfoExtractor
	5	from ..utils import (
	6	determine_ext,
	7	int_or_none,
	8	unescapeHTML,
	9	unified_timestamp,
	10	)
	11
	12
	13	class ExpressenIE(InfoExtractor):
	14	_VALID_URL = r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
	15	_TESTS = [{
	16	'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/',
	17	'md5': '2fbbe3ca14392a6b1b36941858d33a45',
	18	'info_dict': {
	19	'id': '8690962',
	20	'ext': 'mp4',
	21	'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden',
	22	'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba',
	23	'thumbnail': r're:^https?://.*\.jpg$',
	24	'duration': 788,
	25	'timestamp': 1526639109,
	26	'upload_date': '20180518',
	27	},
	28	}, {
	29	'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/',
	30	'only_matching': True,
	31	}]
	32
	33	def _real_extract(self, url):
	34	display_id = self._match_id(url)
	35
	36	webpage = self._download_webpage(url, display_id)
	37
	38	def extract_data(name):
	39	return self._parse_json(
	40	self._search_regex(
	41	r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
	42	webpage, 'info', group='value'),
	43	display_id, transform_source=unescapeHTML)
	44
	45	info = extract_data('video-tracking-info')
	46	video_id = info['videoId']
	47
	48	data = extract_data('article-data')
	49	stream = data['stream']
	50
	51	if determine_ext(stream) == 'm3u8':
	52	formats = self._extract_m3u8_formats(
	53	stream, display_id, 'mp4', entry_protocol='m3u8_native',
	54	m3u8_id='hls')
	55	else:
	56	formats = [{
	57	'url': stream,
	58	}]
	59	self._sort_formats(formats)
	60
	61	title = info.get('titleRaw') or data['title']
	62	description = info.get('descriptionRaw')
	63	thumbnail = info.get('socialMediaImage') or data.get('image')
	64	duration = int_or_none(info.get('videoTotalSecondsDuration') or
65	data.get('totalSecondsDuration'))
66	timestamp = unified_timestamp(info.get('publishDate'))
67
68	return {
69	'id': video_id,
70	'display_id': display_id,
71	'title': title,
72	'description': description,
73	'thumbnail': thumbnail,
74	'duration': duration,
75	'timestamp': timestamp,
76	'formats': formats,
77	}