[yt-dlp.git] / yt_dlp / extractor / la7.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    float_or_none,
    parse_duration,
    smuggle_url,
    unified_strdate,
)


class LA7IE(InfoExtractor):
    # Extractor for la7.it "video" / "rivedila7" pages and tg.la7.it replays.
    # The page is only scraped for a `vid` value; the media itself is resolved
    # by handing a `kaltura:` URL to the extractor named by ie_key 'Kaltura'.
    IE_NAME = 'la7.it'
    _VALID_URL = r'''(?x)(https?://)?(?:
        (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/|
        tg\.la7\.it/repliche-tgla7\?id=
    )(?P<id>.+)'''

    _TESTS = [{
        # 'src' is a plain URL
        'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
        'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
        'info_dict': {
            'id': '0_42j6wd36',
            'ext': 'mp4',
            'title': 'Inc.Cool8',
            'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
            'thumbnail': 're:^https?://.*',
            'uploader_id': 'kdla7pillole@iltrovatore.it',
            'timestamp': 1443814869,
            'upload_date': '20151002',
        },
    }, {
        'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_transparent result that defers to the Kaltura extractor.

        The `vid` value is read from the player parameters embedded in the
        page; title, description and thumbnail are taken from the page through
        the `_og_search_*` helpers and are None when absent.
        """
        video_id = self._match_id(url)

        # _VALID_URL makes the scheme optional, so add one before downloading.
        if not url.startswith('http'):
            url = '%s//%s' % (self.http_scheme(), url)

        webpage = self._download_webpage(url, video_id)

        # Two alternative ways the player parameters may appear in the page.
        player_patterns = [
            r'(?s)videoParams\s*=\s*({.+?});',
            r'videoLa7\(({[^;]+})\);',
        ]
        player_data = self._search_regex(
            player_patterns, webpage, 'player data')
        kaltura_entry = self._search_regex(
            r'vid\s*:\s*"(.+?)",', player_data, 'vid')

        # The service URL is smuggled along with the kaltura: URL.
        kaltura_url = smuggle_url(
            'kaltura:103:%s' % kaltura_entry,
            {'service_url': 'http://nkdam.iltrovatore.it'})

        info = {
            '_type': 'url_transparent',
            'url': kaltura_url,
            'id': video_id,
        }
        info['title'] = self._og_search_title(webpage, default=None)
        info['description'] = self._og_search_description(webpage, default=None)
        info['thumbnail'] = self._og_search_thumbnail(webpage, default=None)
        info['ie_key'] = 'Kaltura'
        return info


class LA7PodcastEpisodeIE(InfoExtractor):
    # Extractor for a single la7.it podcast episode (regular or embed URL).
    # The parsing logic lives in _extract_info() so that it can also be applied
    # to an HTML fragment describing one episode.
    IE_NAME = 'la7.it:pod:episode'
    _VALID_URL = r'''(?x)(https?://)?
        (?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)'''

    _TESTS = [{
        'url': 'https://www.la7.it/voicetown/podcast/la-carezza-delle-memoria-di-carlo-verdone-23-03-2021-371497',
        'md5': '7737d4d79b3c1a34b3de3e16297119ed',
        'info_dict': {
            'id': '371497',
            'ext': 'mp3',
            'title': '"La carezza delle memoria" di Carlo Verdone',
            'description': 'md5:5abf07c3c551a687db80af3f9ceb7d52',
            'thumbnail': 'https://www.la7.it/sites/default/files/podcast/371497.jpg',
            'upload_date': '20210323',
        },
    }, {
        # embed url
        'url': 'https://www.la7.it/embed/podcast/371497',
        'only_matching': True,
    }, {
        # date already in the title
        'url': 'https://www.la7.it/propagandalive/podcast/lintervista-di-diego-bianchi-ad-annalisa-cuzzocrea-puntata-del-1932021-20-03-2021-371130',
        'only_matching': True,
    }, {
        # title same as show_title
        'url': 'https://www.la7.it/otto-e-mezzo/podcast/otto-e-mezzo-26-03-2021-372340',
        'only_matching': True,
    }]

    def _extract_info(self, webpage, video_id=None, ppn=None):
        """Build the info dict of one podcast episode from a piece of HTML.

        webpage  -- HTML to search (a full page or a fragment of one)
        video_id -- episode id; when falsy it is read from a `data-nid`
                    attribute in the HTML
        ppn      -- show title; when falsy it is looked up in the HTML as
                    `ppN: '...'`

        The id, media URL and title lookups pass no default, i.e. they are
        mandatory; every other field falls back to None when not found.
        """
        if not video_id:
            video_id = self._search_regex(
                r'data-nid=([\'"])(?P<vid>\d+)\1',
                webpage, 'video_id', group='vid')

        # `mp3.*?` (rather than `mp3.+?`) so that a URL ending exactly in
        # "mp3" is accepted as well as one carrying a query string after it.
        media_url = self._search_regex(
            (r'src:\s*([\'"])(?P<url>.+?mp3.*?)\1',
             r'data-podcast=([\'"])(?P<url>.+?mp3.*?)\1'),
            webpage, 'media_url', group='url')
        ext = determine_ext(media_url)
        formats = [{
            'url': media_url,
            'format_id': ext,
            'ext': ext,
        }]
        self._sort_formats(formats)

        title = self._html_search_regex(
            (r'<div class="title">(?P<title>.+?)</',
             r'<title>(?P<title>[^<]+)</title>',
             r'title:\s*([\'"])(?P<title>.+?)\1'),
            webpage, 'title', group='title')

        # Prefer a description found in the markup, then the meta tag.
        description = (
            self._html_search_regex(
                (r'<div class="description">(.+?)</div>',
                 r'<div class="description-mobile">(.+?)</div>',
                 r'<div class="box-txt">([^<]+?)</div>',
                 r'<div class="field-content"><p>(.+?)</p></div>'),
                webpage, 'description', default=None)
            or self._html_search_meta('description', webpage))

        # Passing a default already makes these lookups optional.
        thumb = self._html_search_regex(
            (r'<div class="podcast-image"><img src="(.+?)"></div>',
             r'<div class="container-embed"[^<]+url\((.+?)\);">',
             r'<div class="field-content"><img src="(.+?)"'),
            webpage, 'thumbnail', default=None)

        duration = parse_duration(self._html_search_regex(
            r'<span class="(?:durata|duration)">([\d:]+)</span>',
            webpage, 'duration', default=None))

        # Publication date as printed on the page (digits and dots only).
        date = self._html_search_regex(
            r'class="data">\s*(?:<span>)?([\d\.]+)\s*</',
            webpage, 'date', default=None)

        # A date-looking token that is already part of the title, if any.
        date_alt = self._search_regex(
            r'(\d+[\./]\d+[\./]\d+)', title, 'date_alt', default=None)
        ppn = ppn or self._search_regex(
            r'ppN:\s*([\'"])(?P<ppn>.+?)\1',
            webpage, 'ppn', group='ppn', default=None)
        # if the date is not in the title
        # and title is the same as the show_title
        # add the date to the title
        if date and not date_alt and ppn and ppn.lower() == title.lower():
            title += ' del %s' % date
        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': float_or_none(duration),
            'formats': formats,
            'thumbnail': thumb,
            'upload_date': unified_strdate(date),
        }

    def _real_extract(self, url):
        """Download the episode page and parse it with _extract_info()."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        return self._extract_info(webpage, video_id)


class LA7PodcastIE(LA7PodcastEpisodeIE):
    # Playlist extractor for a show's podcast index page. Every episode block
    # found in the page is parsed with the inherited _extract_info().
    IE_NAME = 'la7.it:podcast'
    _VALID_URL = r'(https?://)?(www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])'

    _TESTS = [{
        'url': 'https://www.la7.it/propagandalive/podcast',
        'info_dict': {
            'id': 'propagandalive',
            'title': "Propaganda Live",
        },
        'playlist_count': 10,
    }]

    def _real_extract(self, url):
        """Return a playlist made of the episodes listed on the show page."""
        show_id = self._match_id(url)
        page = self._download_webpage(url, show_id)

        # First <h1> of the page if there is one, otherwise the og title.
        heading = self._html_search_regex(
            r'<h1.*?>(.+?)</h1>', page, 'title', fatal=False, default=None)
        playlist_title = heading or self._og_search_title(page)

        # Show title as set in the page's `window.ppN = '...'` assignment;
        # forwarded to _extract_info() for each episode.
        show_name = self._search_regex(
            r'window\.ppN\s*=\s*([\'"])(?P<ppn>.+?)\1',
            page, 'ppn', group='ppn', default=None)

        # One match per episode container; group 1 is the episode's markup.
        episode_re = r'<div class="container-podcast-property">([\s\S]+?)(?:</div>\s*){3}'
        entries = [
            self._extract_info(block.group(1), ppn=show_name)
            for block in re.finditer(episode_re, page)]

        return self.playlist_result(entries, show_id, playlist_title)
Commit	Line	Data
712b0b5b	1	# coding: utf-8
a17d16d5 PH	2	from __future__ import unicode_literals
a17d16d5 PH	3
a6ae61a4	4	import re
a6ae61a4	5
a17d16d5 PH	6	from .common import InfoExtractor
a17d16d5 PH	7	from ..utils import (
a6ae61a4	8	determine_ext,
	9	float_or_none,
	10	parse_duration,
dafafe7c	11	smuggle_url,
a6ae61a4	12	unified_strdate,
a17d16d5 PH	13	)
	14
	15
	16	class LA7IE(InfoExtractor):
712b0b5b YCH	17	IE_NAME = 'la7.it'
	18	_VALID_URL = r'''(?x)(https?://)?(?:
	19	(?:www\.)?la7\.it/([^/]+)/(?:rivedila7\|video)/\|
	20	tg\.la7\.it/repliche-tgla7\?id=
	21	)(?P<id>.+)'''
	22
	23	_TESTS = [{
	24	# 'src' is a plain URL
	25	'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
dafafe7c	26	'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
712b0b5b	27	'info_dict': {
0f9d5356	28	'id': '0_42j6wd36',
712b0b5b YCH	29	'ext': 'mp4',
712b0b5b YCH	30	'title': 'Inc.Cool8',
8067a2b8	31	'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
712b0b5b	32	'thumbnail': 're:^https?://.*',
dafafe7c RA	33	'uploader_id': 'kdla7pillole@iltrovatore.it',
	34	'timestamp': 1443814869,
	35	'upload_date': '20151002',
712b0b5b	36	},
712b0b5b YCH	37	}, {
	38	'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
	39	'only_matching': True,
	40	}]
a17d16d5 PH	41
a17d16d5 PH	42	def _real_extract(self, url):
228d30ed	43	video_id = self._match_id(url)
712b0b5b	44
ab36800b	45	if not url.startswith('http'):
	46	url = '%s//%s' % (self.http_scheme(), url)
	47
712b0b5b YCH	48	webpage = self._download_webpage(url, video_id)
712b0b5b YCH	49
8067a2b8 LT	50	player_data = self._search_regex(
	51	[r'(?s)videoParams\s=\s({.+?});', r'videoLa7\(({[^;]+})\);'],
	52	webpage, 'player data')
	53	vid = self._search_regex(r'vid\s:\s"(.+?)",', player_data, 'vid')
712b0b5b	54
a17d16d5	55	return {
dafafe7c	56	'_type': 'url_transparent',
8067a2b8	57	'url': smuggle_url('kaltura:103:%s' % vid, {
0f9d5356	58	'service_url': 'http://nkdam.iltrovatore.it',
dafafe7c	59	}),
a17d16d5	60	'id': video_id,
8067a2b8	61	'title': self._og_search_title(webpage, default=None),
712b0b5b	62	'description': self._og_search_description(webpage, default=None),
8067a2b8	63	'thumbnail': self._og_search_thumbnail(webpage, default=None),
dafafe7c	64	'ie_key': 'Kaltura',
a17d16d5	65	}
a6ae61a4	66
	67
	68	class LA7PodcastEpisodeIE(InfoExtractor):
	69	IE_NAME = 'la7.it:pod:episode'
	70	_VALID_URL = r'''(?x)(https?://)?
	71	(?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)'''
	72
	73	_TESTS = [{
	74	'url': 'https://www.la7.it/voicetown/podcast/la-carezza-delle-memoria-di-carlo-verdone-23-03-2021-371497',
	75	'md5': '7737d4d79b3c1a34b3de3e16297119ed',
	76	'info_dict': {
	77	'id': '371497',
	78	'ext': 'mp3',
	79	'title': '"La carezza delle memoria" di Carlo Verdone',
	80	'description': 'md5:5abf07c3c551a687db80af3f9ceb7d52',
	81	'thumbnail': 'https://www.la7.it/sites/default/files/podcast/371497.jpg',
	82	'upload_date': '20210323',
	83	},
	84	}, {
	85	# embed url
	86	'url': 'https://www.la7.it/embed/podcast/371497',
	87	'only_matching': True,
	88	}, {
	89	# date already in the title
	90	'url': 'https://www.la7.it/propagandalive/podcast/lintervista-di-diego-bianchi-ad-annalisa-cuzzocrea-puntata-del-1932021-20-03-2021-371130',
	91	'only_matching': True,
	92	}, {
	93	# title same as show_title
	94	'url': 'https://www.la7.it/otto-e-mezzo/podcast/otto-e-mezzo-26-03-2021-372340',
	95	'only_matching': True,
	96	}]
	97
	98	def _extract_info(self, webpage, video_id=None, ppn=None):
	99	if not video_id:
	100	video_id = self._search_regex(
	101	r'data-nid=([\'"])(?P<vid>\d+)\1',
	102	webpage, 'video_id', group='vid')
	103
	104	media_url = self._search_regex(
	105	(r'src:\s*([\'"])(?P<url>.+?mp3.+?)\1',
	106	r'data-podcast=([\'"])(?P<url>.+?mp3.+?)\1'),
	107	webpage, 'media_url', group='url')
	108	ext = determine_ext(media_url)
	109	formats = [{
	110	'url': media_url,
	111	'format_id': ext,
	112	'ext': ext,
	113	}]
	114	self._sort_formats(formats)
	115
	116	title = self._html_search_regex(
	117	(r'<div class="title">(?P<title>.+?)</',
	118	r'<title>(?P<title>[^<]+)</title>',
	119	r'title:\s*([\'"])(?P<title>.+?)\1'),
	120	webpage, 'title', group='title')
	121
	122	description = (
	123	self._html_search_regex(
	124	(r'<div class="description">(.+?)</div>',
	125	r'<div class="description-mobile">(.+?)</div>',
	126	r'<div class="box-txt">([^<]+?)</div>',
	127	r'<div class="field-content"><p>(.+?)</p></div>'),
	128	webpage, 'description', default=None)
	129	or self._html_search_meta('description', webpage))
130
131	thumb = self._html_search_regex(
132	(r'<div class="podcast-image"><img src="(.+?)"></div>',
133	r'<div class="container-embed"[^<]+url\((.+?)\);">',
134	r'<div class="field-content"><img src="(.+?)"'),
135	webpage, 'thumbnail', fatal=False, default=None)
136
137	duration = parse_duration(self._html_search_regex(
138	r'<span class="(?:durata\|duration)">([\d:]+)</span>',
139	webpage, 'duration', fatal=False, default=None))
140
141	date = self._html_search_regex(
142	r'class="data">\s(?:<span>)?([\d\.]+)\s</',
143	webpage, 'date', default=None)
144
145	date_alt = self._search_regex(
146	r'(\d+[\./]\d+[\./]\d+)', title, 'date_alt', default=None)
147	ppn = ppn or self._search_regex(
148	r'ppN:\s*([\'"])(?P<ppn>.+?)\1',
149	webpage, 'ppn', group='ppn', default=None)
150	# if the date is not in the title
151	# and title is the same as the show_title
152	# add the date to the title
153	if date and not date_alt and ppn and ppn.lower() == title.lower():
154	title += ' del %s' % date
155	return {
156	'id': video_id,
157	'title': title,
158	'description': description,
159	'duration': float_or_none(duration),
160	'formats': formats,
161	'thumbnail': thumb,
162	'upload_date': unified_strdate(date),
163	}
164
165	def _real_extract(self, url):
166	video_id = self._match_id(url)
167	webpage = self._download_webpage(url, video_id)
168
169	return self._extract_info(webpage, video_id)
170
171
172	class LA7PodcastIE(LA7PodcastEpisodeIE):
173	IE_NAME = 'la7.it:podcast'
174	_VALID_URL = r'(https?://)?(www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$\|[#?])'
175
176	_TESTS = [{
177	'url': 'https://www.la7.it/propagandalive/podcast',
178	'info_dict': {
179	'id': 'propagandalive',
180	'title': "Propaganda Live",
181	},
182	'playlist_count': 10,
183	}]
184
185	def _real_extract(self, url):
186	playlist_id = self._match_id(url)
187	webpage = self._download_webpage(url, playlist_id)
188
189	title = (
190	self._html_search_regex(
191	r'<h1.*?>(.+?)</h1>', webpage, 'title', fatal=False, default=None)
192	or self._og_search_title(webpage))
193	ppn = self._search_regex(
194	r'window\.ppN\s=\s([\'"])(?P<ppn>.+?)\1',
195	webpage, 'ppn', group='ppn', default=None)
196
197	entries = []
198	for episode in re.finditer(
199	r'<div class="container-podcast-property">([\s\S]+?)(?:</div>\s*){3}',
200	webpage):
201	entries.append(self._extract_info(episode.group(1), ppn=ppn))
202
203	return self.playlist_result(entries, playlist_id, title)