jfr.im git - yt-dlp.git/blame_incremental

... / ...

Commit	Line	Data
	1	import re
	2
	3	from .common import InfoExtractor
	4	from ..utils import (
	5	determine_ext,
	6	float_or_none,
	7	HEADRequest,
	8	int_or_none,
	9	parse_duration,
	10	unified_strdate,
	11	)
	12
	13
	14	class LA7IE(InfoExtractor):
	15	IE_NAME = 'la7.it'
	16	_VALID_URL = r'''(?x)(https?://)?(?:
	17	(?:www\.)?la7\.it/([^/]+)/(?:rivedila7\|video)/\|
	18	tg\.la7\.it/repliche-tgla7\?id=
	19	)(?P<id>.+)'''
	20
	21	_TESTS = [{
	22	# 'src' is a plain URL
	23	'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
	24	'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
	25	'info_dict': {
	26	'id': 'inccool8-02-10-2015-163722',
	27	'ext': 'mp4',
	28	'title': 'Inc.Cool8',
	29	'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
	30	'thumbnail': 're:^https?://.*',
	31	'upload_date': '20151002',
	32	},
	33	}, {
	34	'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
	35	'only_matching': True,
	36	}]
	37	_HOST = 'https://awsvodpkg.iltrovatore.it'
	38
	39	def _generate_mp4_url(self, quality, m3u8_formats):
	40	for f in m3u8_formats:
	41	if f['vcodec'] != 'none' and quality in f['url']:
	42	http_url = '%s%s.mp4' % (self._HOST, quality)
	43
	44	urlh = self._request_webpage(
	45	HEADRequest(http_url), quality,
	46	note='Check filesize', fatal=False)
	47	if urlh:
	48	http_f = f.copy()
	49	del http_f['manifest_url']
	50	http_f.update({
	51	'format_id': http_f['format_id'].replace('hls-', 'https-'),
	52	'url': http_url,
	53	'protocol': 'https',
	54	'filesize_approx': int_or_none(urlh.headers.get('Content-Length', None)),
	55	})
	56	return http_f
	57	return None
	58
	59	def _real_extract(self, url):
	60	video_id = self._match_id(url)
	61
	62	if not url.startswith('http'):
	63	url = '%s//%s' % (self.http_scheme(), url)
	64
	65	webpage = self._download_webpage(url, video_id)
	66	video_path = self._search_regex(r'(/content/.*?).mp4', webpage, 'video_path')
	67
	68	formats = self._extract_mpd_formats(
	69	f'{self._HOST}/local/dash/,{video_path}.mp4.urlset/manifest.mpd',
	70	video_id, mpd_id='dash', fatal=False)
	71	m3u8_formats = self._extract_m3u8_formats(
	72	f'{self._HOST}/local/hls/,{video_path}.mp4.urlset/master.m3u8',
	73	video_id, 'mp4', m3u8_id='hls', fatal=False)
	74	formats.extend(m3u8_formats)
	75
	76	for q in filter(None, video_path.split(',')):
	77	http_f = self._generate_mp4_url(q, m3u8_formats)
	78	if http_f:
	79	formats.append(http_f)
	80
	81	self._sort_formats(formats)
	82
	83	return {
	84	'id': video_id,
	85	'title': self._og_search_title(webpage, default=None),
	86	'description': self._og_search_description(webpage, default=None),
	87	'thumbnail': self._og_search_thumbnail(webpage, default=None),
	88	'formats': formats,
	89	'upload_date': unified_strdate(self._search_regex(r'datetime="(.+?)"', webpage, 'upload_date', fatal=False))
	90	}
	91
	92
	93	class LA7PodcastEpisodeIE(InfoExtractor):
	94	IE_NAME = 'la7.it:pod:episode'
	95	_VALID_URL = r'''(?x)(https?://)?
	96	(?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)'''
	97
	98	_TESTS = [{
	99	'url': 'https://www.la7.it/voicetown/podcast/la-carezza-delle-memoria-di-carlo-verdone-23-03-2021-371497',
	100	'md5': '7737d4d79b3c1a34b3de3e16297119ed',
	101	'info_dict': {
	102	'id': '371497',
	103	'ext': 'mp3',
	104	'title': '"La carezza delle memoria" di Carlo Verdone',
	105	'description': 'md5:5abf07c3c551a687db80af3f9ceb7d52',
	106	'thumbnail': 'https://www.la7.it/sites/default/files/podcast/371497.jpg',
	107	'upload_date': '20210323',
	108	},
	109	}, {
	110	# embed url
	111	'url': 'https://www.la7.it/embed/podcast/371497',
	112	'only_matching': True,
	113	}, {
	114	# date already in the title
	115	'url': 'https://www.la7.it/propagandalive/podcast/lintervista-di-diego-bianchi-ad-annalisa-cuzzocrea-puntata-del-1932021-20-03-2021-371130',
	116	'only_matching': True,
	117	}, {
	118	# title same as show_title
	119	'url': 'https://www.la7.it/otto-e-mezzo/podcast/otto-e-mezzo-26-03-2021-372340',
	120	'only_matching': True,
	121	}]
	122
	123	def _extract_info(self, webpage, video_id=None, ppn=None):
	124	if not video_id:
	125	video_id = self._search_regex(
	126	r'data-nid=([\'"])(?P<vid>\d+)\1',
	127	webpage, 'video_id', group='vid')
	128
	129	media_url = self._search_regex(
	130	(r'src:\s*([\'"])(?P<url>.+?mp3.+?)\1',
	131	r'data-podcast=([\'"])(?P<url>.+?mp3.+?)\1'),
	132	webpage, 'media_url', group='url')
	133	ext = determine_ext(media_url)
	134	formats = [{
	135	'url': media_url,
	136	'format_id': ext,
	137	'ext': ext,
	138	}]
	139	self._sort_formats(formats)
	140
	141	title = self._html_search_regex(
	142	(r'<div class="title">(?P<title>.+?)</',
	143	r'<title>(?P<title>[^<]+)</title>',
	144	r'title:\s*([\'"])(?P<title>.+?)\1'),
	145	webpage, 'title', group='title')
	146
	147	description = (
	148	self._html_search_regex(
	149	(r'<div class="description">(.+?)</div>',
	150	r'<div class="description-mobile">(.+?)</div>',
	151	r'<div class="box-txt">([^<]+?)</div>',
	152	r'<div class="field-content"><p>(.+?)</p></div>'),
	153	webpage, 'description', default=None)
	154	or self._html_search_meta('description', webpage))
	155
	156	thumb = self._html_search_regex(
	157	(r'<div class="podcast-image"><img src="(.+?)"></div>',
	158	r'<div class="container-embed"[^<]+url$(.+?)$;">',
	159	r'<div class="field-content"><img src="(.+?)"'),
	160	webpage, 'thumbnail', fatal=False, default=None)
	161
	162	duration = parse_duration(self._html_search_regex(
	163	r'<span class="(?:durata\|duration)">([\d:]+)</span>',
	164	webpage, 'duration', fatal=False, default=None))
	165
	166	date = self._html_search_regex(
	167	r'class="data">\s(?:<span>)?([\d\.]+)\s</',
	168	webpage, 'date', default=None)
	169
	170	date_alt = self._search_regex(
	171	r'(\d+[\./]\d+[\./]\d+)', title, 'date_alt', default=None)
	172	ppn = ppn or self._search_regex(
	173	r'ppN:\s*([\'"])(?P<ppn>.+?)\1',
	174	webpage, 'ppn', group='ppn', default=None)
	175	# if the date is not in the title
	176	# and title is the same as the show_title
	177	# add the date to the title
	178	if date and not date_alt and ppn and ppn.lower() == title.lower():
	179	title += ' del %s' % date
	180	return {
	181	'id': video_id,
	182	'title': title,
	183	'description': description,
	184	'duration': float_or_none(duration),
	185	'formats': formats,
	186	'thumbnail': thumb,
	187	'upload_date': unified_strdate(date),
	188	}
	189
	190	def _real_extract(self, url):
	191	video_id = self._match_id(url)
	192	webpage = self._download_webpage(url, video_id)
	193
	194	return self._extract_info(webpage, video_id)
	195
	196
	197	class LA7PodcastIE(LA7PodcastEpisodeIE):
	198	IE_NAME = 'la7.it:podcast'
	199	_VALID_URL = r'(https?://)?(www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$\|[#?])'
	200
	201	_TESTS = [{
	202	'url': 'https://www.la7.it/propagandalive/podcast',
	203	'info_dict': {
	204	'id': 'propagandalive',
	205	'title': "Propaganda Live",
	206	},
	207	'playlist_count': 10,
	208	}]
	209
	210	def _real_extract(self, url):
	211	playlist_id = self._match_id(url)
	212	webpage = self._download_webpage(url, playlist_id)
	213
	214	title = (
	215	self._html_search_regex(
	216	r'<h1.*?>(.+?)</h1>', webpage, 'title', fatal=False, default=None)
	217	or self._og_search_title(webpage))
	218	ppn = self._search_regex(
	219	r'window\.ppN\s=\s([\'"])(?P<ppn>.+?)\1',
	220	webpage, 'ppn', group='ppn', default=None)
	221
	222	entries = []
	223	for episode in re.finditer(
	224	r'<div class="container-podcast-property">([\s\S]+?)(?:</div>\s*){3}',
	225	webpage):
	226	entries.append(self._extract_info(episode.group(1), ppn=ppn))
	227
	228	return self.playlist_result(entries, playlist_id, title)